diff --git a/.github/workflows/requirements-dev.txt b/.github/workflows/requirements-dev.txt new file mode 100644 index 0000000..4f1aa6e --- /dev/null +++ b/.github/workflows/requirements-dev.txt @@ -0,0 +1,5 @@ +# Copyright (C) 2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +pre-commit diff --git a/.github/workflows/static_checks.yaml b/.github/workflows/static_checks.yaml new file mode 100644 index 0000000..a16c84d --- /dev/null +++ b/.github/workflows/static_checks.yaml @@ -0,0 +1,73 @@ +# Copyright (C) 2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +name: Static code checks + +on: # yamllint disable-line rule:truthy + pull_request: + push: + branches: + - '**' + tags-ignore: + - '**' + +env: + LICENSE: AGPL-3.0-or-later + FETCH_DEPTH: 1 + FULL_HISTORY: 0 + SKIP_WORD_PRESENCE_CHECK: 0 + +jobs: + static-code-check: + if: endsWith(github.event.repository.name, 'private') + + name: Run static code checks + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + + steps: + - name: Setup history + if: github.ref == 'refs/heads/oss' + run: | + echo "FETCH_DEPTH=0" >> $GITHUB_ENV + echo "FULL_HISTORY=1" >> $GITHUB_ENV + + - name: Setup version + if: github.ref == 'refs/heads/melco' + run: | + echo "SKIP_WORD_PRESENCE_CHECK=1" >> $GITHUB_ENV + + - name: Check out code + uses: actions/checkout@v3 + with: + fetch-depth: ${{ env.FETCH_DEPTH }} # '0' to check full history + + - name: Set up environment + run: git config user.email github-bot@merl.com + + - name: Set up python + uses: actions/setup-python@v4 + with: + python-version: 3 + cache: 'pip' + cache-dependency-path: '.github/workflows/requirements-dev.txt' + + - name: Install python packages + run: pip install -r .github/workflows/requirements-dev.txt + + - name: Check files + uses: merl-oss-private/merl-file-check-action@v1 + with: + license: ${{ env.LICENSE }} + full-history: ${{ env.FULL_HISTORY }} # If true, use fetch-depth 0 above + skip-word-presence-check: ${{ env.SKIP_WORD_PRESENCE_CHECK }} + + - name: Check license compatibility + if: github.ref != 'refs/heads/melco' + uses: merl-oss-private/merl_license_compatibility_checker@v1 + with: + input-filename: requirements.txt + license: ${{ env.LICENSE }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dfe4fc5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,168 @@ +# Copyright (C) 2023 Mitsubishi Electric Research Laboratories (MERL). +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +# Python .gitignore from https://github.com/github/gitignore/blob/main/Python.gitignore +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +# lib/ +# lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Custom ignores +.DS_Store diff --git a/.reuse/dep5 b/.reuse/dep5 new file mode 100644 index 0000000..a5e82f4 --- /dev/null +++ b/.reuse/dep5 @@ -0,0 +1,26 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ + +Files: .vscode/* images/task_avlen.png +Copyright: 2023 Mitsubishi Electric Research Laboratories (MERL) +License: AGPL-3.0-or-later + +# Except as noted in the files themselves (modified or written by MERL) +Files: soundspaces/* ss_baselines/* configs/* res/* scripts/AmbisonicBinauralizer +Copyright: Facebook, Inc. and its affiliates. +License: CC-BY-4.0 + +Files: habitat-lab-dialog/* +Copyright: Meta Platforms, Inc. and its affiliates. 
+License: MIT + +Files: ss_baselines/savi/dialog/speaker/* +Copyright: 2018, Daniel Fried, Ronghang Hu, Volkan Cirik, Anna Rohrbach, Jacob Andreas, Louis-Philippe Morency, Taylor Berg-Kirkpatrick, Kate Saenko, Dan Klein, Trevor Darrell +License: BSD-2-Clause + +Files: ss_baselines/savi/dialog/ques_gen/* +Copyright: 2019, Ranjay Krishna +License: MIT + +Files: ss_baselines/savi/dialog/speaker/pybind11/* +Copyright: 2016 Wenzel Jakob +License: BSD-3-Clause diff --git a/.vscode/README.md b/.vscode/README.md new file mode 100644 index 0000000..4eb80d7 --- /dev/null +++ b/.vscode/README.md @@ -0,0 +1,9 @@ +# VS Code recommended extensions and settings + +These files provide recommended extensions and workspace settings for VS Code for python development. The recommended extensions are: + +* [Python](https://marketplace.visualstudio.com/items?itemName=ms-python.python): Official python extension from Microsoft +* [Python Type Hint](https://marketplace.visualstudio.com/items?itemName=njqdev.vscode-python-typehint): Type hint completion for Python +* [autoDocstring](https://marketplace.visualstudio.com/items?itemName=njpwerner.autodocstring): Generates python docstrings automatically + +If these extensions are not already globally installed, they will be recommended to you for installation when you open the project in VS Code. diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..2d42587 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,7 @@ +{ + "recommendations": [ + "ms-python.python", + "njqdev.vscode-python-typehint", + "njpwerner.autodocstring" + ] +} diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..b5520a9 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,32 @@ +{ + "editor.rulers": [ + 120 + ], + "[python]": { + "editor.tabSize": 4 + }, + "[markdown]": { + "editor.wordWrap": "bounded", + "editor.wordWrapColumn": 120 + }, + "files.eol": "\n", + "files.insertFinalNewline": true, + "files.trimFinalNewlines": true, + "files.trimTrailingWhitespace": true, + "editor.formatOnSave": true, + "python.formatting.provider": "black", + "python.formatting.blackArgs": [ + "--line-length=120" + ], + "python.linting.flake8Enabled": true, + "python.linting.enabled": true, + "python.linting.flake8Args": [ + "--max-line-length=120", + "--extend-ignore=E203" + ], + "python.testing.pytestArgs": [ + "tests" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..08901d5 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,10 @@ +<!-- +Copyright (C) 2023 Mitsubishi Electric Research Laboratories (MERL) + +SPDX-License-Identifier: AGPL-3.0-or-later +--> + +# Contributing + +Sorry, but we do not currently accept contributions in the form of pull requests to this repository. However, you are +welcome to post issues (bug reports, feature requests, questions, etc.). diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..cba6f6a --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,660 @@ +### GNU AFFERO GENERAL PUBLIC LICENSE + +Version 3, 19 November 2007 + +Copyright (C) 2007 Free Software Foundation, Inc. +<https://fsf.org/> + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + +### Preamble + +The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software.
+ +The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains +free software for all its users. + +When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + +Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + +A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + +The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + +An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing +under this license. + +The precise terms and conditions for copying, distribution and +modification follow. + +### TERMS AND CONDITIONS + +#### 0. Definitions. + +"This License" refers to version 3 of the GNU Affero General Public +License. + +"Copyright" also means copyright-like laws that apply to other kinds +of works, such as semiconductor masks. + +"The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + +To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of +an exact copy. The resulting work is called a "modified version" of +the earlier work or a work "based on" the earlier work. + +A "covered work" means either the unmodified Program or a work based +on the Program. + +To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. 
+ +To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user +through a computer network, with no transfer of a copy, is not +conveying. + +An interactive user interface displays "Appropriate Legal Notices" to +the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + +#### 1. Source Code. + +The "source code" for a work means the preferred form of the work for +making modifications to it. "Object code" means any non-source form of +a work. + +A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + +The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + +The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + +The Corresponding Source need not include anything that users can +regenerate automatically from other parts of the Corresponding Source. + +The Corresponding Source for a work in source code form is that same +work. + +#### 2. Basic Permissions. + +All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. 
+ +You may make, run and propagate covered works that you do not convey, +without conditions so long as your license otherwise remains in force. +You may convey covered works to others for the sole purpose of having +them make modifications exclusively for you, or provide you with +facilities for running those works, provided that you comply with the +terms of this License in conveying all material for which you do not +control copyright. Those thus making or running the covered works for +you must do so exclusively on your behalf, under your direction and +control, on terms that prohibit them from making any copies of your +copyrighted material outside their relationship with you. + +Conveying under any other circumstances is permitted solely under the +conditions stated below. Sublicensing is not allowed; section 10 makes +it unnecessary. + +#### 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + +No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + +When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such +circumvention is effected by exercising rights under this License with +respect to the covered work, and you disclaim any intention to limit +operation or modification of the work as a means of enforcing, against +the work's users, your or third parties' legal rights to forbid +circumvention of technological measures. + +#### 4. Conveying Verbatim Copies. + +You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + +You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + +#### 5. Conveying Modified Source Versions. + +You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these +conditions: + +- a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. +- b) The work must carry prominent notices stating that it is + released under this License and any conditions added under + section 7. This requirement modifies the requirement in section 4 + to "keep intact all notices". +- c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. 
+- d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + +A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + +#### 6. Conveying Non-Source Forms. + +You may convey a covered work in object code form under the terms of +sections 4 and 5, provided that you also convey the machine-readable +Corresponding Source under the terms of this License, in one of these +ways: + +- a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. +- b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the Corresponding + Source from a network server at no charge. +- c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. +- d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. +- e) Convey the object code using peer-to-peer transmission, + provided you inform other peers where the object code and + Corresponding Source of the work are being offered to the general + public at no charge under subsection 6d. + +A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. 
+ +A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, +family, or household purposes, or (2) anything designed or sold for +incorporation into a dwelling. In determining whether a product is a +consumer product, doubtful cases shall be resolved in favor of +coverage. For a particular product received by a particular user, +"normally used" refers to a typical or common use of that class of +product, regardless of the status of the particular user or of the way +in which the particular user actually uses, or expects or is expected +to use, the product. A product is a consumer product regardless of +whether the product has substantial commercial, industrial or +non-consumer uses, unless such uses represent the only significant +mode of use of the product. + +"Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to +install and execute modified versions of a covered work in that User +Product from a modified version of its Corresponding Source. The +information must suffice to ensure that the continued functioning of +the modified object code is in no case prevented or interfered with +solely because modification has been made. + +If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + +The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or +updates for a work that has been modified or installed by the +recipient, or for the User Product in which it has been modified or +installed. Access to a network may be denied when the modification +itself materially and adversely affects the operation of the network +or violates the rules and protocols for communication across the +network. + +Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + +#### 7. Additional Terms. + +"Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + +When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. 
(Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + +Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders +of that material) supplement the terms of this License with terms: + +- a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or +- b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or +- c) Prohibiting misrepresentation of the origin of that material, + or requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or +- d) Limiting the use for publicity purposes of names of licensors + or authors of the material; or +- e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or +- f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions + of it) with contractual assumptions of liability to the recipient, + for any liability that these contractual assumptions directly + impose on those licensors and authors. + +All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + +If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + +Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; the +above requirements apply either way. + +#### 8. Termination. + +You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + +However, if you cease all violation of this License, then your license +from a particular copyright holder is reinstated (a) provisionally, +unless and until the copyright holder explicitly and finally +terminates your license, and (b) permanently, if the copyright holder +fails to notify you of the violation by some reasonable means prior to +60 days after the cessation. 
+ +Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + +Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + +#### 9. Acceptance Not Required for Having Copies. + +You are not required to accept this License in order to receive or run +a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + +#### 10. Automatic Licensing of Downstream Recipients. + +Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + +An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + +You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + +#### 11. Patents. + +A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + +A contributor's "essential patent claims" are all patent claims owned +or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. 
+ +Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + +In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + +If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + +If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + +A patent license is "discriminatory" if it does not include within the +scope of its coverage, prohibits the exercise of, or is conditioned on +the non-exercise of one or more of the rights that are specifically +granted under this License. You may not convey a covered work if you +are a party to an arrangement with a third party that is in the +business of distributing software, under which you make payment to the +third party based on the extent of your activity of conveying the +work, and under which the third party grants, to any of the parties +who would receive the covered work from you, a discriminatory patent +license (a) in connection with copies of the covered work conveyed by +you (or copies made from those copies), or (b) primarily for and in +connection with specific products or compilations that contain the +covered work, unless you entered into that arrangement, or that patent +license was granted, prior to 28 March 2007. + +Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + +#### 12. No Surrender of Others' Freedom. + +If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. 
If you cannot convey a +covered work so as to satisfy simultaneously your obligations under +this License and any other pertinent obligations, then as a +consequence you may not convey it at all. For example, if you agree to +terms that obligate you to collect a royalty for further conveying +from those to whom you convey the Program, the only way you could +satisfy both those terms and this License would be to refrain entirely +from conveying the Program. + +#### 13. Remote Network Interaction; Use with the GNU General Public License. + +Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your +version supports such interaction) an opportunity to receive the +Corresponding Source of your version by providing access to the +Corresponding Source from a network server at no charge, through some +standard or customary means of facilitating copying of software. This +Corresponding Source shall include the Corresponding Source for any +work covered by version 3 of the GNU General Public License that is +incorporated pursuant to the following paragraph. + +Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + +#### 14. Revised Versions of this License. + +The Free Software Foundation may publish revised and/or new versions +of the GNU Affero General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever +published by the Free Software Foundation. + +If the Program specifies that a proxy can decide which future versions +of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + +Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + +#### 15. Disclaimer of Warranty. + +THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT +WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND +PERFORMANCE OF THE PROGRAM IS WITH YOU. 
SHOULD THE PROGRAM PROVE +DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR +CORRECTION. + +#### 16. Limitation of Liability. + +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR +CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES +ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT +NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR +LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM +TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER +PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +#### 17. Interpretation of Sections 15 and 16. + +If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + +END OF TERMS AND CONDITIONS + +### How to Apply These Terms to Your New Programs + +If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these +terms. + +To do so, attach the following notices to the program. It is safest to +attach them to the start of each source file to most effectively state +the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper +mail. + +If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code. There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for +the specific requirements. + +You should also get your employer (if you work as a programmer) or +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. For more information on this, and how to apply and follow +the GNU AGPL, see <https://www.gnu.org/licenses/>.
diff --git a/LICENSES/BSD-2-Clause.txt b/LICENSES/BSD-2-Clause.txt new file mode 100644 index 0000000..bc9cd16 --- /dev/null +++ b/LICENSES/BSD-2-Clause.txt @@ -0,0 +1,28 @@ +BSD 2-Clause License + +Copyright (c) 2018, Daniel Fried, Ronghang Hu, Volkan Cirik, Anna Rohrbach, +Jacob Andreas, Louis-Philippe Morency, Taylor Berg-Kirkpatrick, Kate Saenko, +Dan Klein, Trevor Darrell + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/LICENSES/BSD-3-Clause.txt b/LICENSES/BSD-3-Clause.txt new file mode 100644 index 0000000..ccf4e97 --- /dev/null +++ b/LICENSES/BSD-3-Clause.txt @@ -0,0 +1,36 @@ +Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ +You are under no obligation whatsoever to provide any bug fixes, patches, or +upgrades to the features, functionality or performance of the source code +("Enhancements") to anyone; however, if you choose to make your Enhancements +available either publicly, or directly to the author of this software, without +imposing a separate written license agreement for such Enhancements, then you +hereby grant the following license: a non-exclusive, royalty-free perpetual +license to install, use, modify, prepare derivative works, incorporate into +other computer software, distribute, and sublicense such enhancements or +derivative works thereof, in binary and source code form. diff --git a/LICENSES/CC-BY-4.0.txt b/LICENSES/CC-BY-4.0.txt new file mode 100644 index 0000000..a6d7fd3 --- /dev/null +++ b/LICENSES/CC-BY-4.0.txt @@ -0,0 +1,384 @@ +Attribution 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. 
+ Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution 4.0 International Public License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution 4.0 International Public License ("Public License"). To the +extent this Public License may be interpreted as a contract, You are +granted the Licensed Rights in consideration of Your acceptance of +these terms and conditions, and the Licensor grants You such rights in +consideration of benefits the Licensor receives from making the +Licensed Material available under these terms and conditions. + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + +b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + +c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + +d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + +e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + +f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + +g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + +h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + +i. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + +j. 
Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + +k. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + +Section 2 -- Scope. + +a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part; and + + b. produce, reproduce, and Share Adapted Material. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + +b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. 
In all other cases the Licensor expressly + reserves any right to collect such royalties. + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + +a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + +a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database; + +b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + +c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + +a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + +b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + +c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + +Section 6 -- Term and Termination. + +a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + +b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + +c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + +d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +Section 7 -- Other Terms and Conditions. + +a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + +b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +Section 8 -- Interpretation. + +a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + +b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + +c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + +d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public licenses. +Notwithstanding, Creative Commons may elect to apply one of its public +licenses to material it publishes and in those instances will be +considered the "Licensor." 
Except for the limited purpose of indicating +that material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the public +licenses. + +Creative Commons may be contacted at creativecommons.org. diff --git a/LICENSES/MIT.txt b/LICENSES/MIT.txt new file mode 100644 index 0000000..e30a468 --- /dev/null +++ b/LICENSES/MIT.txt @@ -0,0 +1,57 @@ +List of all MIT licenses + +habitat-lab + +MIT License + +Copyright (c) Meta Platforms, Inc. and its affiliates. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +ques_gen + +COPYRIGHT + +Copyright (c) 2019, Ranjay Krishna. +All rights reserved. + +Each contributor holds copyright over their respective contributions. +The project versioning (Git) records all such contribution source information. + +LICENSE + +The MIT License (MIT) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0d22ded
--- /dev/null
+++ b/README.md
@@ -0,0 +1,205 @@
+
+
+# AVLEN: Audio-Visual-Language Embodied Navigation in 3D Environments
+![](./images/task_avlen.png)
+
+## Overview
+This repository contains the training and testing code used in the NeurIPS 2022 paper 'AVLEN: Audio-Visual-Language Embodied Navigation in 3D Environments' by Sudipta Paul, Amit K. Roy-Chowdhury, and Anoop Cherian.
+
+Our code is built on [SoundSpaces 1.0](https://github.com/facebookresearch/sound-spaces).
+
+## Installation
+
+### AVLEN
+1. `git clone https://github.com/merlresearch/avlen.git`
+
+2. `export ROOT=`
+
+3. Create a virtual env with python=3.7; this env will be used throughout:\
+   `conda create -n avlen_env python=3.7 cmake=3.14.0`
+
+4. The directories are assumed to be organized as follows:
+```
+├── project
+    ├── avlen                  # ROOT directory
+    |   |── habitat-lab-dialog # modified v0.1.7 of habitat-lab
+    |   |── ...                # other files and folders
+    |
+    └── habitat-sim            # v0.1.7 of habitat-sim
+```
+5. Install [habitat-lab-dialog](https://github.com/spaul007/habitat-lab-dialog.git) (a modified version of habitat-lab v0.1.7):
+
+```
+cd $ROOT/habitat-lab-dialog
+pip install -r requirements.txt
+python setup.py develop --all # install habitat and habitat_baselines
+```
+
+6. Install [habitat-sim v0.1.7](https://github.com/facebookresearch/habitat-sim/tree/v0.1.7) (with `--headless` and `--with-cuda`; a build sketch is given after this list):\
+   `git clone --branch v0.1.7 https://github.com/facebookresearch/habitat-sim.git`\
+   Check further instructions [here](https://github.com/facebookresearch/habitat-sim/blob/v0.1.7/BUILD_FROM_SOURCE.md)
+
+7. Set `PYTHONPATH=$PYTHONPATH:$ROOT/habitat-lab-dialog`
+
+8. Install the `avlen` repo via pip by running the following command:
+```
+cd $ROOT
+pip install -e .
+```
+9. Follow the instructions on the [dataset](https://github.com/facebookresearch/sound-spaces/tree/main/soundspaces) page to download the rendered audio data and datasets, and put them under the `$ROOT/data/` folder.
+
+10. Add the connectivity files from [here](https://drive.google.com/drive/folders/11GjL3RZnbRPGUv05wzP9sxSYjJVRCEJ6?usp=sharing) to `$ROOT/connectivity/`
+
+11. `export PYTHONPATH=$PYTHONPATH:`
+
+12. Download the repurposed dataset for training the language-based policy $\pi_l$ from [here](https://drive.google.com/drive/folders/1zGLDG3vxeETO13dBQde2H5qmxgKiAlJH?usp=sharing) and place it in `$ROOT/data/datasets/semantic_audionav_dialog_approx/`
+
+13. Install CLIP from [here](https://github.com/openai/CLIP) (an install sketch is given after this list)
+
+14. Download `node2view.json` and `view2node.json` from [here](https://drive.google.com/drive/folders/1TjnFdupuC7dEVnmz9-6gWPqx1gCGXApB?usp=sharing) and place them in the `$ROOT/data` folder.
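+
+The snippet below is only a sketch of the habitat-sim build from step 6, not an official recipe: the flags mirror the ones used by habitat-lab's own CI (see `habitat-lab-dialog/.circleci/config.yml` in this repository); consult the linked BUILD_FROM_SOURCE.md if they do not match your setup.
+
+```
+# Sketch: build habitat-sim v0.1.7 headless with CUDA (run inside avlen_env)
+cd ../habitat-sim        # sibling of $ROOT, per the layout in step 4
+pip install -r requirements.txt
+python setup.py install --headless --with-cuda   # CI additionally passes --bullet for physics
+```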
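+
+Step 13 only links the CLIP repository; at the time of writing, its README installs it as below. Treat this as an assumption about the upstream instructions and verify against https://github.com/openai/CLIP:
+
+```
+# Sketch: install CLIP and its small dependencies into avlen_env
+pip install ftfy regex tqdm
+pip install git+https://github.com/openai/CLIP.git
+```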
+
+### Speaker (Location: $ROOT/ss_baselines/savi/dialog/speaker)
+
+1. Compile the Matterport3D Simulator:
+```
+cd $ROOT/ss_baselines/savi/dialog/speaker
+mkdir build && cd build
+cmake -DPYTHON_EXECUTABLE:FILEPATH=/path/to/your/bin/python ..
+make
+cd ../
+```
+This installs v0.1. Use `-DPYTHON_EXECUTABLE` if you want to build against a specific virtual env; otherwise just use `cmake ..` \
+Check [Matterport3D Simulator v0.1](https://github.com/peteanderson80/Matterport3DSimulator/tree/v0.1) for further dependency installation.
+
+2. Download the precomputed ResNet image features from [here](https://www.dropbox.com/s/o57kxh2mn5rkx4o/ResNet-152-imagenet.zip?dl=1) and place them in `$ROOT/ss_baselines/savi/dialog/speaker/img_features/`
+```
+mkdir -p $ROOT/ss_baselines/savi/dialog/speaker/img_features/
+cd $ROOT/ss_baselines/savi/dialog/speaker/img_features/
+wget https://url/ResNet-152-imagenet.zip -O ResNet-152-imagenet.zip
+unzip ResNet-152-imagenet.zip
+```
+
+3. Download the FGR2R dataset from [here](https://github.com/YicongHong/Fine-Grained-R2R) and place it in `$ROOT/ss_baselines/savi/dialog/speaker/tasks/R2R/data/`
+
+4. Download the pretrained weights of the speaker model from [here](http://url/speaker_model_weights.zip). Unzip it and place the two files in `$ROOT/ss_baselines/savi/dialog/speaker/tasks/R2R/speaker/snapshots/`
+
+### You may also need to install
+- a CUDA-compatible PyTorch version (>=1.7.1)
+- a torchtext version compatible with your PyTorch version
+
+## Pretrained Weights for $\pi_g$ and $\pi_l$
+
+Download the weights from [here](http://url/pretrained_weights.zip), unzip them, and place the two folders, (i) semantic_audionav and (ii) semantic_audionav_distractor, at `$ROOT/data/pretrained_weights/`
+
+
+## Instructions for Training AVLEN
+
+To use multiple GPUs, submit `slurm.sh` from the `$ROOT` dir; the `slurm.sh` file is in `$ROOT/ss_baselines/savi/`. Before submitting (a submission sketch follows this list):
+- Edit the python command based on the type of training
+- Change the master port to run multiple instances at the same time
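+
+As a sketch only (assuming a standard Slurm cluster; the actual resource requests and the python command live in `slurm.sh` itself), submission looks like:
+
+```
+cd $ROOT
+# after editing the python command and the master port inside the script:
+sbatch ss_baselines/savi/slurm.sh
+squeue -u $USER   # check that the job is queued/running
+```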
+
+### Heard/Unheard Sound
+
+There are two stages of training:
+- 1st stage: does not use history information
+- 2nd stage: uses history information
+
+#### 1st stage:
+```
+python ss_baselines/savi/run.py --exp-config ss_baselines/savi/config/semantic_audionav/savi_interactive_1st_stage.yaml --model-dir data/models/AVLEN RL.SOFT_QUERY_REWARD True ALLOW_STOP True RL.QUERY_REWARD -1.2 RL.CONSECUTIVE_REWARD -0.5 REPLAY_STORE True
+```
+
+#### 2nd stage:
+```
+python ss_baselines/savi/run.py --exp-config ss_baselines/savi/config/semantic_audionav/savi_interactive_2nd_stage.yaml --model-dir data/models/AVLEN RL.SOFT_QUERY_REWARD True ALLOW_STOP True RL.QUERY_REWARD -1.2 RL.CONSECUTIVE_REWARD -0.5 RESUME_CHECKPOINT True
+```
+
+Use `savi_interactive_1st_stage.yaml` for the first stage of training and `savi_interactive_2nd_stage.yaml` for the second.
+
+### Distractor Sound
+
+#### 1st stage:
+```
+python ss_baselines/savi/run.py --exp-config ss_baselines/savi/config/semantic_audionav_distractor/savi_interactive_1st_stage.yaml --model-dir data/models/AVLEN_dis RL.SOFT_QUERY_REWARD True ALLOW_STOP True RL.QUERY_REWARD -1.2 RL.CONSECUTIVE_REWARD -0.5
+```
+#### 2nd stage:
+```
+python ss_baselines/savi/run.py --exp-config ss_baselines/savi/config/semantic_audionav_distractor/savi_interactive_2nd_stage.yaml --model-dir data/models/AVLEN_dis RL.SOFT_QUERY_REWARD True ALLOW_STOP True RL.QUERY_REWARD -1.2 RL.CONSECUTIVE_REWARD -1.0 RESUME_CHECKPOINT True
+```
+
+## Instructions for Training $\pi_l$
+
+Training uses a single GPU:
+```
+python ss_baselines/savi/run.py --exp-config ss_baselines/savi/config/semantic_audionav/savi_pretraining_dialog_training.yaml --model-dir data/models/AVLEN_VLN
+```
+
+## Pretrained Weights for $\pi_q$
+
+Download the weights from [here](http://url/ckpt.119.pth) and place them as `$ROOT/data/models/AVLEN/data/ckpt.119.pth` (general case) or `$ROOT/data/models/AVLEN_dis/data/ckpt.119.pth` (distractor case), where `$ROOT/data/models/AVLEN/` and `$ROOT/data/models/AVLEN_dis/` are the model directories for the general and distractor cases, respectively.
+
+
+## Instructions for Testing AVLEN
+
+To evaluate a single checkpoint, pass the checkpoint path as `EVAL_CKPT_PATH_DIR`.
+
+### Unheard and Heard
+
+```
+python ss_baselines/savi/run.py --run-type eval --exp-config ss_baselines/savi/config/semantic_audionav/savi_pretraining_interactive.yaml EVAL_CKPT_PATH_DIR EVAL.SPLIT test USE_SYNC_VECENV True RL.DDPPO.pretrained False
+```
+The above command uses 'unheard' sounds. If you want to use 'heard' sounds, update L214 of `$ROOT/soundspaces/tasks/semantic_audionav_task.py`
+
+### Distractor
+```
+python ss_baselines/savi/run.py --run-type eval --exp-config ss_baselines/savi/config/semantic_audionav_distractor/savi_pretraining_interactive.yaml EVAL_CKPT_PATH_DIR EVAL.SPLIT test_distractor USE_SYNC_VECENV True RL.DDPPO.pretrained False
+```
+
+## Instructions for Testing $\pi_l$
+
+```
+python ss_baselines/savi/run.py --run-type eval --exp-config ss_baselines/savi/config/semantic_audionav/savi_pretraining_dialog_training.yaml EVAL_CKPT_PATH_DIR val USE_SYNC_VECENV True RL.DDPPO.pretrained False
+```
+
+## Contact
+Anoop Cherian, cherian@merl.com, or Sudipta Paul, spaul007@ucr.edu.
+
+## Citation
+```
+@article{paul2022avlen,
+  title={AVLEN: Audio-Visual-Language Embodied Navigation in 3D Environments},
+  author={Paul, Sudipta and Roy-Chowdhury, Amit K and Cherian, Anoop},
+  journal={arXiv preprint arXiv:2210.07940},
+  year={2022}
+}
+```
+
+## Contributing
+
+See [CONTRIBUTING.md](CONTRIBUTING.md) for our policy on contributions.
+
+## License
+
+Released under the `AGPL-3.0-or-later` license, as found in the [LICENSE.md](LICENSE.md) file.
+
+All files, except as noted below:
+
+```
+Copyright (c) 2022-2023 Mitsubishi Electric Research Laboratories (MERL)
+
+SPDX-License-Identifier: AGPL-3.0-or-later
+```
+
+`SoundSpaces` was adapted from https://github.com/facebookresearch/sound-spaces (`CC-BY-4.0` license as found in [LICENSES/CC-BY-4.0.txt](LICENSES/CC-BY-4.0.txt)).
+
+`Habitat Lab Dialog` was adapted from https://github.com/facebookresearch/habitat-lab/tree/v0.1.7 (`MIT` License as found in [LICENSES/MIT.txt](LICENSES/MIT.txt)).
+
+`ss_baselines/savi/dialog/speaker` was adapted from https://github.com/ronghanghu/speaker_follower/blob/master/ (`BSD-2-Clause` license as found in [LICENSES/BSD-2-Clause.txt](LICENSES/BSD-2-Clause.txt)).
+
+`ss_baselines/savi/dialog/ques_gen` was adapted from https://github.com/ranjaykrishna/iq/ (`MIT` license as found in [LICENSES/MIT.txt](LICENSES/MIT.txt)).
+
+`ss_baselines/savi/dialog/speaker/pybind11` is from https://github.com/pybind/pybind11/ (`BSD-3-Clause` license as found in [LICENSES/BSD-3-Clause.txt](LICENSES/BSD-3-Clause.txt)).
diff --git a/configs/audionav/av_nav/mp3d/audiogoal.yaml b/configs/audionav/av_nav/mp3d/audiogoal.yaml new file mode 100644 index 0000000..019b9f7 --- /dev/null +++ b/configs/audionav/av_nav/mp3d/audiogoal.yaml @@ -0,0 +1,43 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + RIR_SAMPLING_RATE: 16000 + +TASK: + TYPE: AudioNav + SUCCESS_DISTANCE: 0.2 + + SENSORS: ['SPECTROGRAM_SENSOR'] + GOAL_SENSOR_UUID: spectrogram + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + SUCCESS_DISTANCE: 0.2 + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_SOURCE_AND_TARGET: True + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_nav/mp3d/audiopointgoal.yaml b/configs/audionav/av_nav/mp3d/audiopointgoal.yaml new file mode 100644 index 0000000..cf286bc --- /dev/null +++ b/configs/audionav/av_nav/mp3d/audiopointgoal.yaml @@ -0,0 +1,46 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + RIR_SAMPLING_RATE: 16000 + +TASK: + TYPE: AudioNav + SUCCESS_DISTANCE: 0.2 + + SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR', 'SPECTROGRAM_SENSOR'] + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "POLAR" + DIMENSIONALITY: 2 + GOAL_SENSOR_UUID: pointgoal_with_gps_compass,spectrogram + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + SUCCESS_DISTANCE: 0.2 + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_SOURCE_AND_TARGET: True + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/audionav/mp3d/{version}/{split}/{split}.json.gz" \ No newline at end of file diff --git a/configs/audionav/av_nav/mp3d/interactive_demo.yaml b/configs/audionav/av_nav/mp3d/interactive_demo.yaml new file mode 100644 index 0000000..4a18f7c --- /dev/null +++ b/configs/audionav/av_nav/mp3d/interactive_demo.yaml @@ -0,0 +1,46 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 1024 + HEIGHT: 1024 + DEPTH_SENSOR: + WIDTH: 1024 + HEIGHT: 1024 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + USE_RENDERED_OBSERVATIONS: False + CONTINUOUS_VIEW_CHANGE: True + VIEW_CHANGE_FPS: 30 + AUDIO: + RIR_SAMPLING_RATE: 16000 + SOURCE_SOUND_DIR: "data/sounds/1s_all" + +TASK: + TYPE: AudioNav + SUCCESS_DISTANCE: 0.2 + + SENSORS: ['AUDIOGOAL_SENSOR'] + GOAL_SENSOR_UUID: audiogoal + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL'] + SPL: + TYPE: SPL + SUCCESS_DISTANCE: 0.2 + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_SOURCE_AND_TARGET: True + DRAW_BORDER: True 
+ DRAW_SHORTEST_PATH: False + +DATASET: + TYPE: "AudioNav" + SPLIT: "val_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_nav/mp3d/pointgoal.yaml b/configs/audionav/av_nav/mp3d/pointgoal.yaml new file mode 100644 index 0000000..5c3cb9e --- /dev/null +++ b/configs/audionav/av_nav/mp3d/pointgoal.yaml @@ -0,0 +1,44 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + +TASK: + TYPE: AudioNav + SUCCESS_DISTANCE: 0.2 + + SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR'] + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "POLAR" + DIMENSIONALITY: 2 + GOAL_SENSOR_UUID: pointgoal_with_gps_compass + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + SUCCESS_DISTANCE: 0.2 + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_SOURCE_AND_TARGET: True + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: "v1" + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_nav/mp3d/pointgoal_question.yaml b/configs/audionav/av_nav/mp3d/pointgoal_question.yaml new file mode 100644 index 0000000..4b4bd6e --- /dev/null +++ b/configs/audionav/av_nav/mp3d/pointgoal_question.yaml @@ -0,0 +1,44 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 224 + HEIGHT: 224 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + +TASK: + TYPE: AudioNav + SUCCESS_DISTANCE: 0.2 + + SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR'] + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "POLAR" + DIMENSIONALITY: 2 + GOAL_SENSOR_UUID: pointgoal_with_gps_compass + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + SUCCESS_DISTANCE: 0.2 + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_SOURCE_AND_TARGET: True + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: "v1" + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_nav/replica/audiogoal.yaml b/configs/audionav/av_nav/replica/audiogoal.yaml new file mode 100644 index 0000000..1b50b3e --- /dev/null +++ b/configs/audionav/av_nav/replica/audiogoal.yaml @@ -0,0 +1,43 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 +# NOISE_MODEL: RedwoodDepthNoiseModel + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "replica" + AUDIO: + RIR_SAMPLING_RATE: 44100 + +TASK: + TYPE: AudioNav + SUCCESS_DISTANCE: 0.2 + + SENSORS: ['SPECTROGRAM_SENSOR'] + GOAL_SENSOR_UUID: spectrogram + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 
'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + SUCCESS_DISTANCE: 0.2 + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_SOURCE_AND_TARGET: True + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/replica" + DATA_PATH: "data/datasets/audionav/replica/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_nav/replica/audiopointgoal.yaml b/configs/audionav/av_nav/replica/audiopointgoal.yaml new file mode 100644 index 0000000..08007ba --- /dev/null +++ b/configs/audionav/av_nav/replica/audiopointgoal.yaml @@ -0,0 +1,45 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "replica" + AUDIO: + RIR_SAMPLING_RATE: 44100 + +TASK: + TYPE: AudioNav + SUCCESS_DISTANCE: 0.2 + + SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR', 'SPECTROGRAM_SENSOR'] + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "POLAR" + DIMENSIONALITY: 2 + GOAL_SENSOR_UUID: pointgoal_with_gps_compass,spectrogram + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + SUCCESS_DISTANCE: 0.2 + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_SOURCE_AND_TARGET: True + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/replica" + DATA_PATH: "data/datasets/audionav/replica/{version}/{split}/{split}.json.gz" \ No newline at end of file diff --git a/configs/audionav/av_nav/replica/interactive_demo.yaml b/configs/audionav/av_nav/replica/interactive_demo.yaml new file mode 100644 index 0000000..674fc2d --- /dev/null +++ b/configs/audionav/av_nav/replica/interactive_demo.yaml @@ -0,0 +1,45 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 1024 + HEIGHT: 1024 + DEPTH_SENSOR: + WIDTH: 1024 + HEIGHT: 1024 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + USE_RENDERED_OBSERVATIONS: False + CONTINUOUS_VIEW_CHANGE: True + VIEW_CHANGE_FPS: 30 + AUDIO: + RIR_SAMPLING_RATE: 44100 + SOURCE_SOUND_DIR: "data/sounds/demo" + +TASK: + TYPE: AudioNav + SUCCESS_DISTANCE: 0.2 + + SENSORS: ['AUDIOGOAL_SENSOR'] + GOAL_SENSOR_UUID: audiogoal + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL'] + SPL: + TYPE: SPL + SUCCESS_DISTANCE: 0.2 + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_SOURCE_AND_TARGET: True + DRAW_BORDER: True + DRAW_SHORTEST_PATH: False + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/replica" + DATA_PATH: "data/datasets/audionav/replica/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_nav/replica/pointgoal.yaml b/configs/audionav/av_nav/replica/pointgoal.yaml new file mode 100644 index 0000000..97558e6 --- /dev/null +++ b/configs/audionav/av_nav/replica/pointgoal.yaml @@ -0,0 +1,44 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "replica" + +TASK: + TYPE: 
AudioNav + SUCCESS_DISTANCE: 0.2 + + SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR'] + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "POLAR" + DIMENSIONALITY: 2 + GOAL_SENSOR_UUID: pointgoal_with_gps_compass + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + SUCCESS_DISTANCE: 0.2 + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_SOURCE_AND_TARGET: True + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: "v1" + SCENE_DATASET: "replica" + SCENES_DIR: "data/scene_datasets/replica" + DATA_PATH: "data/datasets/audionav/replica/{version}/{split}/{split}.json.gz" \ No newline at end of file diff --git a/configs/audionav/av_wan/mp3d/audiogoal.yaml b/configs/audionav/av_wan/mp3d/audiogoal.yaml new file mode 100644 index 0000000..769855c --- /dev/null +++ b/configs/audionav/av_wan/mp3d/audiogoal.yaml @@ -0,0 +1,60 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 +# NOISE_MODEL: RedwoodDepthNoiseModel + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + RIR_SAMPLING_RATE: 16000 + +TASK: + TYPE: AudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'EGOMAP_SENSOR', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', "GEOMETRIC_MAP", "ACTION_MAP", 'COLLISION', 'ACOUSTIC_MAP', 'INTENSITY', 'AUDIOGOAL_SENSOR'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: POINT + + GEOMETRIC_MAP: + MAP_SIZE: 400 + INTERNAL_MAP_SIZE: 1200 + MAP_RESOLUTION: 0.1 + ACOUSTIC_MAP: + MAP_SIZE: 20 + MAP_RESOLUTION: 1.0 + ACTION_MAP: + MAP_SIZE: 9 + MAP_RESOLUTION: 1.0 +# EGOMAP_SENSOR: +# MAP_SIZE: 15 +# MAP_RESOLUTION: 0.2 + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_wan/mp3d/audiogoal_without_am.yaml b/configs/audionav/av_wan/mp3d/audiogoal_without_am.yaml new file mode 100644 index 0000000..37d6267 --- /dev/null +++ b/configs/audionav/av_wan/mp3d/audiogoal_without_am.yaml @@ -0,0 +1,60 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 +# NOISE_MODEL: RedwoodDepthNoiseModel + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + RIR_SAMPLING_RATE: 16000 + +TASK: + TYPE: AudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'EGOMAP_SENSOR', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', "GEOMETRIC_MAP", "ACTION_MAP", 'COLLISION'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True 
+ DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: POINT + + GEOMETRIC_MAP: + MAP_SIZE: 400 + INTERNAL_MAP_SIZE: 1200 + MAP_RESOLUTION: 0.1 + ACOUSTIC_MAP: + MAP_SIZE: 20 + MAP_RESOLUTION: 1.0 + ACTION_MAP: + MAP_SIZE: 9 + MAP_RESOLUTION: 1.0 +# EGOMAP_SENSOR: +# MAP_SIZE: 15 +# MAP_RESOLUTION: 0.2 + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_wan/mp3d/audiogoal_without_both.yaml b/configs/audionav/av_wan/mp3d/audiogoal_without_both.yaml new file mode 100644 index 0000000..5fb5548 --- /dev/null +++ b/configs/audionav/av_wan/mp3d/audiogoal_without_both.yaml @@ -0,0 +1,60 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 +# NOISE_MODEL: RedwoodDepthNoiseModel + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + RIR_SAMPLING_RATE: 16000 + +TASK: + TYPE: AudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'EGOMAP_SENSOR', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', "ACTION_MAP", 'COLLISION'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: POINT + + GEOMETRIC_MAP: + MAP_SIZE: 400 + INTERNAL_MAP_SIZE: 1200 + MAP_RESOLUTION: 0.1 + ACOUSTIC_MAP: + MAP_SIZE: 20 + MAP_RESOLUTION: 1.0 + ACTION_MAP: + MAP_SIZE: 9 + MAP_RESOLUTION: 1.0 +# EGOMAP_SENSOR: +# MAP_SIZE: 15 +# MAP_RESOLUTION: 0.2 + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_wan/mp3d/audiogoal_without_gm.yaml b/configs/audionav/av_wan/mp3d/audiogoal_without_gm.yaml new file mode 100644 index 0000000..6153beb --- /dev/null +++ b/configs/audionav/av_wan/mp3d/audiogoal_without_gm.yaml @@ -0,0 +1,60 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 +# NOISE_MODEL: RedwoodDepthNoiseModel + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + RIR_SAMPLING_RATE: 16000 + +TASK: + TYPE: AudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'EGOMAP_SENSOR', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', "ACOUSTIC_MAP", "ACTION_MAP", 'COLLISION', 'INTENSITY'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: POINT + + GEOMETRIC_MAP: + MAP_SIZE: 400 + INTERNAL_MAP_SIZE: 1200 + MAP_RESOLUTION: 0.1 + ACOUSTIC_MAP: + MAP_SIZE: 20 + MAP_RESOLUTION: 1.0 + ACTION_MAP: + MAP_SIZE: 9 + MAP_RESOLUTION: 1.0 +# EGOMAP_SENSOR: +# MAP_SIZE: 
15 +# MAP_RESOLUTION: 0.2 + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_wan/replica/audiogoal.yaml b/configs/audionav/av_wan/replica/audiogoal.yaml new file mode 100644 index 0000000..7b43f3b --- /dev/null +++ b/configs/audionav/av_wan/replica/audiogoal.yaml @@ -0,0 +1,45 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 +# NOISE_MODEL: RedwoodDepthNoiseModel + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "replica" + AUDIO: + RIR_SAMPLING_RATE: 44100 + +TASK: + TYPE: AudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'EGOMAP_SENSOR', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', "GEOMETRIC_MAP", "ACTION_MAP", 'COLLISION', 'ACOUSTIC_MAP', 'INTENSITY'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: POINT + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/replica" + DATA_PATH: "data/datasets/audionav/replica/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_wan/replica/audiogoal_without_am.yaml b/configs/audionav/av_wan/replica/audiogoal_without_am.yaml new file mode 100644 index 0000000..bb9fb11 --- /dev/null +++ b/configs/audionav/av_wan/replica/audiogoal_without_am.yaml @@ -0,0 +1,45 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 +# NOISE_MODEL: RedwoodDepthNoiseModel + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "replica" + AUDIO: + RIR_SAMPLING_RATE: 44100 + +TASK: + TYPE: AudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'EGOMAP_SENSOR', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', "GEOMETRIC_MAP", "ACTION_MAP", 'COLLISION', 'INTENSITY'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: POINT + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/replica" + DATA_PATH: "data/datasets/audionav/replica/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_wan/replica/audiogoal_without_both.yaml b/configs/audionav/av_wan/replica/audiogoal_without_both.yaml new file mode 100644 index 0000000..c71127e --- /dev/null +++ b/configs/audionav/av_wan/replica/audiogoal_without_both.yaml @@ -0,0 +1,45 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 +# NOISE_MODEL: RedwoodDepthNoiseModel + + TYPE: "SoundSpacesSim" + 
ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "replica" + AUDIO: + RIR_SAMPLING_RATE: 44100 + +TASK: + TYPE: AudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'EGOMAP_SENSOR', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', "ACTION_MAP", 'COLLISION', 'INTENSITY'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: POINT + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/replica" + DATA_PATH: "data/datasets/audionav/replica/{version}/{split}/{split}.json.gz" diff --git a/configs/audionav/av_wan/replica/audiogoal_without_gm.yaml b/configs/audionav/av_wan/replica/audiogoal_without_gm.yaml new file mode 100644 index 0000000..92909cf --- /dev/null +++ b/configs/audionav/av_wan/replica/audiogoal_without_gm.yaml @@ -0,0 +1,45 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 +# NOISE_MODEL: RedwoodDepthNoiseModel + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "replica" + AUDIO: + RIR_SAMPLING_RATE: 44100 + +TASK: + TYPE: AudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'EGOMAP_SENSOR', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', "ACOUSTIC_MAP", "ACTION_MAP", 'COLLISION', 'INTENSITY'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: POINT + +DATASET: + TYPE: "AudioNav" + SPLIT: "train_telephone" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/replica" + DATA_PATH: "data/datasets/audionav/replica/{version}/{split}/{split}.json.gz" diff --git a/configs/semantic_audionav/av_nav/mp3d/semantic_audiogoal.yaml b/configs/semantic_audionav/av_nav/mp3d/semantic_audiogoal.yaml new file mode 100644 index 0000000..3672ede --- /dev/null +++ b/configs/semantic_audionav/av_nav/mp3d/semantic_audiogoal.yaml @@ -0,0 +1,47 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + EVERLASTING: False + RIR_SAMPLING_RATE: 16000 + SOURCE_SOUND_DIR: "data/sounds/semantic_splits" + +TASK: + TYPE: SemanticAudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', 'ORACLE_ACTION_SENSOR'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION', 'SUCCESS_WHEN_SILENT'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: VIEW_POINTS + +DATASET: + TYPE: "SemanticAudioNav" + 
SPLIT: "train" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/semantic_audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/semantic_audionav/av_nav/mp3d/semantic_audiogoal_with_label.yaml b/configs/semantic_audionav/av_nav/mp3d/semantic_audiogoal_with_label.yaml new file mode 100644 index 0000000..f936243 --- /dev/null +++ b/configs/semantic_audionav/av_nav/mp3d/semantic_audiogoal_with_label.yaml @@ -0,0 +1,43 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + RIR_SAMPLING_RATE: 16000 + SOURCE_SOUND_DIR: "data/sounds/semantic_splits" + +TASK: + TYPE: SemanticAudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'CATEGORY'] + GOAL_SENSOR_UUID: spectrogram + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: VIEW_POINTS + +DATASET: + TYPE: "SemanticAudioNav" + SPLIT: "train" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/semantic_audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/semantic_audionav/av_wan/mp3d/semantic_audiogoal.yaml b/configs/semantic_audionav/av_wan/mp3d/semantic_audiogoal.yaml new file mode 100644 index 0000000..56222ce --- /dev/null +++ b/configs/semantic_audionav/av_wan/mp3d/semantic_audiogoal.yaml @@ -0,0 +1,57 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + RIR_SAMPLING_RATE: 16000 + SOURCE_SOUND_DIR: "data/sounds/semantic_splits" + +TASK: + TYPE: SemanticAudioNav + + SENSORS: ['AUDIOGOAL_SENSOR', 'SPECTROGRAM_SENSOR', 'EGOMAP_SENSOR', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', "GEOMETRIC_MAP", "ACTION_MAP", 'COLLISION', 'ACOUSTIC_MAP', 'INTENSITY'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION', 'SUCCESS_WHEN_SILENT'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: VIEW_POINTS + + GEOMETRIC_MAP: + MAP_SIZE: 400 + INTERNAL_MAP_SIZE: 1200 + MAP_RESOLUTION: 0.1 + ACOUSTIC_MAP: + MAP_SIZE: 20 + MAP_RESOLUTION: 1.0 + ACTION_MAP: + MAP_SIZE: 9 + MAP_RESOLUTION: 1.0 + +DATASET: + TYPE: "SemanticAudioNav" + SPLIT: "train" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/semantic_audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/semantic_audionav/av_wan/mp3d/semantic_audiogoal_with_label.yaml b/configs/semantic_audionav/av_wan/mp3d/semantic_audiogoal_with_label.yaml new file mode 100644 index 0000000..c1792f3 --- /dev/null +++ b/configs/semantic_audionav/av_wan/mp3d/semantic_audiogoal_with_label.yaml @@ -0,0 +1,57 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 
+SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + RIR_SAMPLING_RATE: 16000 + SOURCE_SOUND_DIR: "data/sounds/semantic_splits" + +TASK: + TYPE: SemanticAudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'EGOMAP_SENSOR', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', "GEOMETRIC_MAP", "ACTION_MAP", 'COLLISION', 'ACOUSTIC_MAP', 'INTENSITY', 'CATEGORY'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION', 'SUCCESS_WHEN_SILENT'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: VIEW_POINTS + + GEOMETRIC_MAP: + MAP_SIZE: 400 + INTERNAL_MAP_SIZE: 1200 + MAP_RESOLUTION: 0.1 + ACOUSTIC_MAP: + MAP_SIZE: 20 + MAP_RESOLUTION: 1.0 + ACTION_MAP: + MAP_SIZE: 9 + MAP_RESOLUTION: 1.0 + +DATASET: + TYPE: "SemanticAudioNav" + SPLIT: "train" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/semantic_audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml b/configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml new file mode 100644 index 0000000..cbfb5c0 --- /dev/null +++ b/configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml @@ -0,0 +1,47 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + EVERLASTING: False + RIR_SAMPLING_RATE: 16000 + SOURCE_SOUND_DIR: "data/sounds/semantic_splits" + +TASK: + TYPE: SemanticAudioNav + + SENSORS: ['AUDIOGOAL_SENSOR', 'SPECTROGRAM_SENSOR', 'CATEGORY', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', 'POSE_SENSOR', 'LOCATION_BELIEF', 'CATEGORY_BELIEF'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION', 'SUCCESS_WHEN_SILENT'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: VIEW_POINTS + +DATASET: + TYPE: "SemanticAudioNav" + SPLIT: "train" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/semantic_audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/semantic_audionav/savi/mp3d/semantic_audiogoal_dialog.yaml b/configs/semantic_audionav/savi/mp3d/semantic_audiogoal_dialog.yaml new file mode 100644 index 0000000..f004835 --- /dev/null +++ b/configs/semantic_audionav/savi/mp3d/semantic_audiogoal_dialog.yaml @@ -0,0 +1,51 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" 
+ GRID_SIZE: 1.0 + AUDIO: + EVERLASTING: False + RIR_SAMPLING_RATE: 16000 + SOURCE_SOUND_DIR: "data/sounds/semantic_splits" + +TASK: + TYPE: SemanticAudioNav + + SENSORS: ['AUDIOGOAL_SENSOR', 'SPECTROGRAM_SENSOR', 'CATEGORY', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', 'POSE_SENSOR', 'LOCATION_BELIEF', 'CATEGORY_BELIEF'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION', 'SUCCESS_WHEN_SILENT'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: VIEW_POINTS + +DATASET: + TYPE: "SemanticAudioNav" + SPLIT: "train" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/semantic_audionav_dialog_approx/mp3d/{version}/{split}/{split}.json.gz" diff --git a/configs/semantic_audionav/savi/mp3d/semantic_audiogoal_distractor.yaml b/configs/semantic_audionav/savi/mp3d/semantic_audiogoal_distractor.yaml new file mode 100644 index 0000000..1d72db8 --- /dev/null +++ b/configs/semantic_audionav/savi/mp3d/semantic_audiogoal_distractor.yaml @@ -0,0 +1,51 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 128 + HEIGHT: 128 + DEPTH_SENSOR: + WIDTH: 128 + HEIGHT: 128 + SEMANTIC_SENSOR: + WIDTH: 128 + HEIGHT: 128 + + TYPE: "SoundSpacesSim" + ACTION_SPACE_CONFIG: "v0" + SCENE_DATASET: "mp3d" + GRID_SIZE: 1.0 + AUDIO: + EVERLASTING: False + RIR_SAMPLING_RATE: 16000 + HAS_DISTRACTOR_SOUND: True + SOURCE_SOUND_DIR: "data/sounds/semantic_splits" + +TASK: + TYPE: SemanticAudioNav + + SENSORS: ['SPECTROGRAM_SENSOR', 'CATEGORY', 'POINTGOAL_WITH_GPS_COMPASS_SENSOR', 'POSE_SENSOR', 'LOCATION_BELIEF', 'CATEGORY_BELIEF'] + GOAL_SENSOR_UUID: spectrogram + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "CARTESIAN" + DIMENSIONALITY: 2 + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'NORMALIZED_DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL', 'NUM_ACTION', 'SUCCESS_WEIGHTED_BY_NUM_ACTION', 'SUCCESS_WHEN_SILENT'] + SPL: + TYPE: SPL + TOP_DOWN_MAP: + MAP_RESOLUTION: 10000 + DRAW_BORDER: True + DRAW_SHORTEST_PATH: True + DISTANCE_TO_GOAL: + DISTANCE_TO: VIEW_POINTS + +DATASET: + TYPE: "SemanticAudioNav" + SPLIT: "train_distractor" + CONTENT_SCENES: ["*"] + VERSION: 'v1' + SCENES_DIR: "data/scene_datasets/mp3d" + DATA_PATH: "data/datasets/semantic_audionav/mp3d/{version}/{split}/{split}.json.gz" diff --git a/habitat-lab-dialog/.circleci/config.yml b/habitat-lab-dialog/.circleci/config.yml new file mode 100644 index 0000000..9c6690f --- /dev/null +++ b/habitat-lab-dialog/.circleci/config.yml @@ -0,0 +1,278 @@ +version: 2 +gpu: &gpu + machine: + image: ubuntu-1604-cuda-10.1:201909-23 + resource_class: gpu.small + environment: + FPS_THRESHOLD: 900 + +jobs: + python_lint: + docker: + - image: circleci/python:3.6 + steps: + - checkout + - run: + name: setup + command: | + sudo pip install black flake8 flake8-builtins flake8-bugbear flake8-comprehensions flake8-return flake8-simplify "isort[pyproject]" numpy --progress-bar off + sudo pip install -r requirements.txt --progress-bar off + - run: + name: run black + command: | + black --exclude '/(\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist)|examples/tutorials/(colabs|nb_python)' habitat/. habitat_baselines/. examples/. test/. 
setup.py --diff + black --exclude '/(\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|_build|buck-out|build|dist)|examples/tutorials/(colabs|nb_python)' habitat/. habitat_baselines/. examples/. test/. setup.py --check + - run: + name: run isort + command: | + isort --version + isort habitat/. habitat_baselines/. examples/. test/. setup.py --diff + isort habitat/. habitat_baselines/. examples/. test/. setup.py --check-only + - run: + name: run flake8 + command: | + flake8 --version + flake8 habitat/. habitat_baselines/. examples/. tests/. setup.py + pre-commit: + docker: + - image: circleci/python:3.6 + working_directory: ~/repo/ + + steps: + - checkout + - run: + name: Combine precommit config and python versions for caching + command: | + cat .pre-commit-config.yaml > pre-commit-deps.txt + python -VV >> pre-commit-deps.txt + - restore_cache: + keys: + - v1-precommit-deps-{{ checksum "pre-commit-deps.txt" }} + + - run: + name: Install Dependencies + command: | + sudo pip install -U pip setuptools pre-commit + # Install the hooks now so that they'll be cached + pre-commit install-hooks + + - save_cache: + paths: + - ~/.cache/pre-commit + key: v1-precommit-deps-{{ checksum "pre-commit-deps.txt" }} + + - run: + name: Check Code Style using pre-commit + command: | + SKIP=clang-format,eslint pre-commit run --show-diff-on-failure --all-files + install_and_test_ubuntu: + <<: *gpu + steps: + - checkout: + path: ./habitat-lab + - run: + name: Install cmake + no_output_timeout: 5m + command: | + echo $(git ls-remote https://github.com/facebookresearch/habitat-sim.git HEAD | awk '{ print $1}') > ./hsim_sha + wget https://github.com/Kitware/CMake/releases/download/v3.13.4/cmake-3.13.4-Linux-x86_64.sh + sudo mkdir /opt/cmake + sudo sh ./cmake-3.13.4-Linux-x86_64.sh --prefix=/opt/cmake --skip-license + sudo ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake + - run: + name: Install dependencies + no_output_timeout: 20m + command: | + sudo apt-get update || true + sudo apt-get install -y --no-install-recommends \ + build-essential \ + git \ + curl \ + vim \ + ca-certificates \ + libbullet-dev \ + libjpeg-dev \ + libglm-dev \ + libegl1-mesa-dev \ + xorg-dev \ + freeglut3-dev \ + pkg-config \ + wget \ + zip \ + unzip || true + sudo apt install --allow-change-held-packages \ + texlive-base \ + texlive-latex-extra \ + texlive-fonts-extra \ + texlive-fonts-recommended + - run: + name: Check CUDA + no_output_timeout: 20m + background: true + command: | + # wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_8.0.44-1_amd64.deb + # sudo dpkg -i cuda-repo-ubuntu1604_8.0.44-1_amd64.deb + # sudo apt-get update || true + # sudo apt-get --yes --force-yes install cuda + # touch ./cuda_installed + nvidia-smi + - restore_cache: + keys: + - conda-{{ checksum "habitat-lab/.circleci/config.yml" }} + - run: + name: Install conda and dependencies + no_output_timeout: 20m + command: | + if [ ! -d ~/miniconda ] + then + curl -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + chmod +x ~/miniconda.sh + ~/miniconda.sh -b -p $HOME/miniconda + rm ~/miniconda.sh + export PATH=$HOME/miniconda/bin:/usr/local/cuda/bin:$PATH + conda create -y -n habitat python=3.6 + . activate habitat + conda install -q -y -c conda-forge ninja ccache numpy pytest pytest-mock pytest-cov + pip install pytest-sugar + fi + - run: + name: Install pytorch + no_output_timeout: 20m + background: true + command: | + if [ ! 
-f ~/miniconda/pytorch_installed ] + then + export PATH=$HOME/miniconda/bin:/usr/local/cuda/bin:$PATH + . activate habitat; + conda install -c conda-forge opencv -y + conda install -y pytorch torchvision cudatoolkit=10.0 -c pytorch + fi + touch ~/miniconda/pytorch_installed + - restore_cache: + keys: + - habitat-sim-{{ checksum "./hsim_sha" }} + - restore_cache: + keys: + - ccache-{{ arch }}-master + paths: + - /home/circleci/.ccache + - run: + name: CCache initialization + command: | + export PATH=$HOME/miniconda/bin:/usr/local/cuda/bin:$PATH + . activate habitat; + ccache --show-stats + ccache --zero-stats + ccache --max-size=10.0G + - run: + name: Build, install habitat-sim and run benchmark + no_output_timeout: 30m + command: | + if [ ! -d ./habitat-sim ] + then + git clone https://github.com/facebookresearch/habitat-sim.git --recursive + fi + # while [ ! -f ./cuda_installed ]; do sleep 2; done # wait for CUDA + export PATH=$HOME/miniconda/bin:/usr/local/cuda/bin:$PATH + . activate habitat; + cd habitat-sim + pip install -r requirements.txt --progress-bar off + python -u setup.py install --headless --with-cuda --bullet + - run: + name: Ccache stats + when: always + command: | + export PATH=$HOME/miniconda/bin:/usr/local/cuda/bin:$PATH + . activate habitat; + ccache --show-stats + - run: + name: Download test data + command: | + if [ ! -f ./habitat-sim/data/scene_datasets/habitat-test-scenes/van-gogh-room.glb ] + then + cd habitat-sim + wget http://dl.fbaipublicfiles.com/habitat/habitat-test-scenes.zip + unzip habitat-test-scenes.zip + rm habitat-test-scenes.zip + fi + - run: + name: Download coda scene + command: | + if [ ! -f ./habitat-sim/data/scene_datasets/coda/coda.glb ] + then + cd habitat-sim + wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Pc-J6pZzXEd8RSeLM94t3iwO8q_RQ853' -O coda.zip + unzip coda.zip -d data/scene_datasets + rm coda.zip + fi + - run: + name: Run sim benchmark + command: | + # while [ ! -f ./cuda_installed ]; do sleep 2; done # wait for CUDA + export PATH=$HOME/miniconda/bin:/usr/local/cuda/bin:$PATH + . activate habitat; cd habitat-sim + python examples/example.py --scene data/scene_datasets/habitat-test-scenes/van-gogh-room.glb --silent --test_fps_regression $FPS_THRESHOLD + - save_cache: + key: habitat-sim-{{ checksum "./hsim_sha" }} + background: true + paths: + - ./habitat-sim + - save_cache: + key: ccache-{{ arch }}-master + background: true + paths: + - /home/circleci/.ccache + - run: + name: Install api + no_output_timeout: 20m + command: | + export PATH=$HOME/miniconda/bin:/usr/local/cuda/bin:$PATH + . activate habitat; cd habitat-lab + while [ ! -f ~/miniconda/pytorch_installed ]; do sleep 2; done # wait for Pytorch + ln -s ../habitat-sim/data data + pip install -r requirements.txt --progress-bar off + touch ~/miniconda/pip_deps_installed + - save_cache: + key: conda-{{ checksum "habitat-lab/.circleci/config.yml" }} + background: true + paths: + - ~/miniconda + - run: + name: Run api tests + no_output_timeout: 30m + command: | + export PATH=$HOME/miniconda/bin:/usr/local/cuda/bin:$PATH + . activate habitat; cd habitat-lab + python setup.py develop --all + export PYTHONPATH=.:$PYTHONPATH + python setup.py test --addopts "--cov-report=xml --cov=./" + + bash <(curl -s https://codecov.io/bash) -f coverage.xml + - run: + name: Build api documentation + command: | + export PATH=$HOME/miniconda/bin:/usr/local/cuda/bin:$PATH + . 
activate habitat; cd habitat-lab + python setup.py develop --all + + # Download sim inventory for crosslinking (no need to build + # the whole sim docs for that) + # TODO: take it from github.com/facebookmicrosites/habitat-website + # instead + mkdir -p ../habitat-sim/build/docs-public/habitat-sim + curl -s https://aihabitat.org/docs/habitat-sim/objects.inv > ../habitat-sim/build/docs-public/habitat-sim/objects.inv + + cd docs + conda install -y -c conda-forge doxygen==1.8.16 + conda install -y jinja2 pygments docutils + mkdir -p ../build/docs + ./build-public.sh + + +workflows: + version: 2 + install_and_test: + jobs: + - pre-commit + - python_lint + - install_and_test_ubuntu diff --git a/habitat-lab-dialog/.editorconfig b/habitat-lab-dialog/.editorconfig new file mode 100644 index 0000000..86817b2 --- /dev/null +++ b/habitat-lab-dialog/.editorconfig @@ -0,0 +1,22 @@ +# See https://editorconfig.org/ for more info :) + +[*] +charset = utf-8 +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true +insert_final_newline = true + +# isort can't parse [*.{py, rst}], so specifying it separately +# https://github.com/timothycrosley/isort/issues/830 +[*.rst] +indent_size = 4 +[*.py] +indent_size = 4 +max_line_length = 79 +multi_line_output = 3 +force_grid_wrap = false +include_trailing_comma = true +ensure_newline_before_comments=true +use_parentheses = true +known_first_party = habitat,habitat_sim,habitat_baselines,version diff --git a/habitat-lab-dialog/.github/ISSUE_TEMPLATE/bug-report.md b/habitat-lab-dialog/.github/ISSUE_TEMPLATE/bug-report.md new file mode 100644 index 0000000..c41bba3 --- /dev/null +++ b/habitat-lab-dialog/.github/ISSUE_TEMPLATE/bug-report.md @@ -0,0 +1,31 @@ +--- +name: "\U0001F41B Bug Report" +about: Submit a bug report to help us improve Habitat + +--- + +## 🐛 Bug + + + +## Steps to Reproduce + +Steps to reproduce the behavior: + + + +1. +2. +3. + +Please note that without a minimal working example to reproduce the bug, we may not be able to help you. + + + +## Expected behavior + + + +## Additional context + + diff --git a/habitat-lab-dialog/.github/ISSUE_TEMPLATE/feature-request.md b/habitat-lab-dialog/.github/ISSUE_TEMPLATE/feature-request.md new file mode 100644 index 0000000..28c848c --- /dev/null +++ b/habitat-lab-dialog/.github/ISSUE_TEMPLATE/feature-request.md @@ -0,0 +1,24 @@ +--- +name: "\U0001F680Feature Request" +about: Submit a proposal/request for a new Habitat feature + +--- + +## 🚀 Feature + + +## Motivation + + + +## Pitch + + + +## Alternatives + + + +## Additional context + + diff --git a/habitat-lab-dialog/.github/ISSUE_TEMPLATE/questions-help-support.md b/habitat-lab-dialog/.github/ISSUE_TEMPLATE/questions-help-support.md new file mode 100644 index 0000000..992f1b5 --- /dev/null +++ b/habitat-lab-dialog/.github/ISSUE_TEMPLATE/questions-help-support.md @@ -0,0 +1,7 @@ +--- +name: "❓Questions/Help/Support" +about: Do you need support? 
+ +--- + +## ❓ Questions and Help diff --git a/habitat-lab-dialog/.github/PULL_REQUEST_TEMPLATE.md b/habitat-lab-dialog/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..c02c7d8 --- /dev/null +++ b/habitat-lab-dialog/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,29 @@ +## Motivation and Context + + + + + +## How Has This Been Tested + + + +## Types of changes + + +- Docs change / refactoring / dependency upgrade +- Bug fix (non-breaking change which fixes an issue) +- New feature (non-breaking change which adds functionality) +- Breaking change (fix or feature that would cause existing functionality to change) + +## Checklist + + + +- [ ] My code follows the code style of this project. +- [ ] My change requires a change to the documentation. +- [ ] I have updated the documentation accordingly. +- [ ] I have read the **CONTRIBUTING** document. +- [ ] I have completed my CLA (see **CONTRIBUTING**) +- [ ] I have added tests to cover my changes. +- [ ] All new and existing tests passed. diff --git a/habitat-lab-dialog/.gitignore b/habitat-lab-dialog/.gitignore new file mode 100644 index 0000000..d15c835 --- /dev/null +++ b/habitat-lab-dialog/.gitignore @@ -0,0 +1,89 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +*/env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +examples/images + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# DotEnv configuration +.env + +# Database +*.db +*.rdb + +# Pycharm +.idea + +# VS Code +.vscode/ + +# Spyder +.spyproject/ + +# Jupyter NB Checkpoints +.ipynb_checkpoints/ + +# exclude data from source control by default +data + +# Mac OS-specific storage files +.DS_Store + +# mypy +.mypy_cache/ + +# vim +*.swp diff --git a/habitat-lab-dialog/.pre-commit-config.yaml b/habitat-lab-dialog/.pre-commit-config.yaml new file mode 100644 index 0000000..3482ed7 --- /dev/null +++ b/habitat-lab-dialog/.pre-commit-config.yaml @@ -0,0 +1,96 @@ +exclude: 'build|src/deps|src/obsolete' + +default_language_version: + python: python3 + +repos: +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v3.4.0 + hooks: + - id: trailing-whitespace + - id: check-added-large-files + args: ['--maxkb=2000'] + - id: end-of-file-fixer + - id: debug-statements + - id: check-case-conflict + - id: check-docstring-first + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-toml + - id: check-yaml + exclude: "habitat_baselines/slambased/data/" + - id: mixed-line-ending + args: ['--fix=lf'] + +- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks + rev: v2.0.0 + hooks: + - id: pretty-format-ini + args: [--autofix] + - id: pretty-format-toml + args: [--autofix] + +- repo: https://github.com/timothycrosley/isort + rev: 5.7.0 + hooks: + - id: isort + exclude: docs/ + additional_dependencies: [toml] + +- repo: https://github.com/ambv/black + rev: 20.8b1 + hooks: + - id: black + exclude: ^examples/tutorials/(nb_python|colabs) + 
+- repo: https://github.com/myint/autoflake + rev: v1.4 + hooks: + - id: autoflake + args: ['--expand-star-imports', '--ignore-init-module-imports', '--in-place'] + exclude: docs/ + +- repo: https://gitlab.com/pycqa/flake8 + rev: 3.8.4 + hooks: + - id: flake8 + exclude: docs/ + additional_dependencies: + - flake8-builtins + - flake8-bugbear + - flake8-comprehensions + - flake8-return + - flake8-simplify + +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v0.800 + hooks: + - id: mypy + pass_filenames: false + +- repo: https://github.com/kynan/nbstripout + rev: 0.3.9 + hooks: + - id: nbstripout + files: ".ipynb" + +- repo: local + hooks: + - id: jupytext-sync + name: Sync scripts and notebooks + files: '^examples/tutorials/(colabs|nb_python)/(.*\.py|.*\.ipynb)$' + entry: jupytext --update-metadata '{"jupytext":{"notebook_metadata_filter":"all", "cell_metadata_filter":"-all"}, "accelerator":"GPU"}' --set-formats 'nb_python//py:percent,colabs//ipynb' --pipe black --pipe "sed s/[[:space:]]*\#[[:space:]]\%\%/\#\%\%/g" --pipe 'isort -' --pipe-fmt 'py:percent' --sync + pass_filenames: true + additional_dependencies: + - 'jupytext==1.5.2' + - 'nbformat<=5.0.8' + - 'black==20.8b1' + - 'isort==5.4.2' + always_run: false + language: python + +- repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.7.1.1 + hooks: + - id: shellcheck + exclude: ^habitat_baselines/slambased/ diff --git a/habitat-lab-dialog/CODE_OF_CONDUCT.md b/habitat-lab-dialog/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..0f7ad8b --- /dev/null +++ b/habitat-lab-dialog/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +# Code of Conduct + +Facebook has adopted a Code of Conduct that we expect project participants to adhere to. +Please read the [full text](https://code.fb.com/codeofconduct/) +so that you can understand what actions will and will not be tolerated. diff --git a/habitat-lab-dialog/CONTRIBUTING.md b/habitat-lab-dialog/CONTRIBUTING.md new file mode 100644 index 0000000..ad6c00f --- /dev/null +++ b/habitat-lab-dialog/CONTRIBUTING.md @@ -0,0 +1,60 @@ +# Contributing to habitat-lab +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `master`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). +7. We have adopted squash-and-merge as the policy for incorporating PRs into the master branch. We encourage more smaller/focused PRs rather than big PRs with many independent changes. This also enables faster development by merging PRs into master quickly and reducing the need to rebase due to changes on master. + + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Facebook's open source projects. + +Complete your CLA here: + +## Versioning / release workflow +We use [semantic versioning](https://semver.org/). To prepare a release: +1. Update version numbers. +2. Update the change log. +3. Make sure all tests are passing. +4. Create a release tag with change log summary using the github release interface (release tag should follow semantic versioning as described above) + +Stable versions are regularly assigned by Habitat core team after rigorous testing. 
+
+## Issues
+We use [GitHub issues](../../issues) to track public bugs. Please ensure your description is
+clear and has sufficient instructions to be able to reproduce the issue.
+
+## Test
+We use the pytest testing framework, along with test data that needs to be downloaded. Please make sure the tests are passing:
+```
+pytest
+```
+
+## Check typing
+We use mypy to check Python typing and guard API consistency. Please make sure the following command doesn't complain prior to submission:
+```
+mypy . --ignore-missing-imports
+```
+
+## Coding Style
+ - We follow PEP8 and use [typing](https://docs.python.org/3/library/typing.html).
+ - Use `black` for style enforcement and linting. Install black through `pip install black`.
+
+ We also use pre-commit hooks to ensure linting and style enforcement. Install the pre-commit hooks with `pip install pre-commit && pre-commit install`.
+
+## Documentation
+- Our documentation style is based on Magnum / Corrade and uses [a similar build system](https://mcss.mosra.cz/documentation/doxygen/).
+- Documentation of PRs is highly encouraged!
+
+## License
+By contributing to habitat-lab, you agree that your contributions will be licensed
+under the LICENSE file in the root directory of this source tree.
diff --git a/habitat-lab-dialog/Dockerfile b/habitat-lab-dialog/Dockerfile
new file mode 100644
index 0000000..bf5137c
--- /dev/null
+++ b/habitat-lab-dialog/Dockerfile
@@ -0,0 +1,53 @@
+# Base image
+FROM nvidia/cudagl:10.1-devel-ubuntu16.04
+
+# Setup basic packages
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    git \
+    curl \
+    vim \
+    ca-certificates \
+    libjpeg-dev \
+    libpng-dev \
+    libglfw3-dev \
+    libglm-dev \
+    libx11-dev \
+    libomp-dev \
+    libegl1-mesa-dev \
+    pkg-config \
+    wget \
+    zip \
+    unzip &&\
+    rm -rf /var/lib/apt/lists/*
+
+# Install conda
+RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh &&\
+    chmod +x ~/miniconda.sh &&\
+    ~/miniconda.sh -b -p /opt/conda &&\
+    rm ~/miniconda.sh &&\
+    /opt/conda/bin/conda install numpy pyyaml scipy ipython mkl mkl-include &&\
+    /opt/conda/bin/conda clean -ya
+ENV PATH /opt/conda/bin:$PATH
+
+# Install cmake
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.14.0/cmake-3.14.0-Linux-x86_64.sh
+RUN mkdir /opt/cmake
+RUN sh /cmake-3.14.0-Linux-x86_64.sh --prefix=/opt/cmake --skip-license
+RUN ln -s /opt/cmake/bin/cmake /usr/local/bin/cmake
+RUN cmake --version
+
+# Conda environment
+RUN conda create -n habitat python=3.6 cmake=3.14.0
+
+# Setup habitat-sim
+RUN git clone --branch stable https://github.com/facebookresearch/habitat-sim.git
+RUN /bin/bash -c ". activate habitat; cd habitat-sim; pip install -r requirements.txt; python setup.py install --headless"
+
+# Install challenge specific habitat-lab
+RUN git clone --branch stable https://github.com/facebookresearch/habitat-lab.git
+RUN /bin/bash -c ". activate habitat; cd habitat-lab; pip install -e ."
+
+# Silence habitat-sim logs
+ENV GLOG_minloglevel=2
+ENV MAGNUM_LOG="quiet"
diff --git a/habitat-lab-dialog/LICENSE b/habitat-lab-dialog/LICENSE
new file mode 100644
index 0000000..b96dcb0
--- /dev/null
+++ b/habitat-lab-dialog/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) Facebook, Inc. and its affiliates.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/habitat-lab-dialog/MANIFEST.in b/habitat-lab-dialog/MANIFEST.in new file mode 100644 index 0000000..85d1274 --- /dev/null +++ b/habitat-lab-dialog/MANIFEST.in @@ -0,0 +1,3 @@ +graft habitat/utils/visualizations/assets +include habitat/py.typed +include habitat_baselines/py.typed diff --git a/habitat-lab-dialog/README.md b/habitat-lab-dialog/README.md new file mode 100644 index 0000000..fda28d8 --- /dev/null +++ b/habitat-lab-dialog/README.md @@ -0,0 +1,209 @@ +[![CircleCI](https://circleci.com/gh/facebookresearch/habitat-lab.svg?style=shield)](https://circleci.com/gh/facebookresearch/habitat-lab) +[![codecov](https://codecov.io/gh/facebookresearch/habitat-lab/branch/master/graph/badge.svg)](https://codecov.io/gh/facebookresearch/habitat-lab) +[![GitHub license](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/facebookresearch/habitat-lab/blob/master/LICENSE) +[![GitHub release (latest by date)](https://img.shields.io/github/v/release/facebookresearch/habitat-lab)](https://github.com/facebookresearch/habitat-lab/releases/latest) +[![Supports Habitat_Sim](https://img.shields.io/static/v1?label=supports&message=Habitat%20Sim&color=informational&link=https://github.com/facebookresearch/habitat-sim)](https://github.com/facebookresearch/habitat-sim) +[![Python 3.6](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) +[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) +[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://timothycrosley.github.io/isort/) +[![Slack Join](http://img.shields.io/static/v1?label=Join%20us%20on&message=%23habitat-dev&labelColor=%234A154B&logo=slack)](https://join.slack.com/t/ai-habitat/shared_invite/enQtNjY1MzM1NDE4MTk2LTZhMzdmYWMwODZlNjg5MjZiZjExOTBjOTg5MmRiZTVhOWQyNzk0OTMyN2E1ZTEzZTNjMWM0MjBkN2VhMjQxMDI) +[![Twitter Follow](https://img.shields.io/twitter/follow/ai_habitat?style=social)](https://twitter.com/ai_habitat) + +Habitat Lab Dialog +============================== + +Modified version of Habitat Lab to adapt it for Neurips 2022 paper 'AVLEN: Audio-Visual-Language Embodied Navigation in 3D Environments' by Sudipta Paul, Amit K. Roy-Chowdhury, and Anoop Cherian. 
+
+
+Habitat Lab
+==============================
+
+Habitat Lab is a modular high-level library for end-to-end development in embodied AI --
+defining embodied AI tasks (e.g. navigation, instruction following, question answering), configuring embodied agents (physical form, sensors, capabilities), training these agents (via imitation or reinforcement learning, or no learning at all as in classical SLAM), and benchmarking their performance on the defined tasks using standard metrics.
+
+Habitat Lab currently uses [`Habitat-Sim`](https://github.com/facebookresearch/habitat-sim) as the core simulator, but is designed with a modular abstraction for the simulator backend to maintain compatibility over multiple simulators. For documentation, refer [here](https://aihabitat.org/docs/habitat-lab/).
+
+We also have a dev slack channel; please follow this [link](https://join.slack.com/t/ai-habitat/shared_invite/enQtNjY1MzM1NDE4MTk2LTZhMzdmYWMwODZlNjg5MjZiZjExOTBjOTg5MmRiZTVhOWQyNzk0OTMyN2E1ZTEzZTNjMWM0MjBkN2VhMjQxMDI) to get added to the channel. If you want to contribute PRs or face issues with habitat, please reach out to us either through GitHub issues or the slack channel.
+
+[![Habitat Demo](https://img.shields.io/static/v1?label=WebGL&message=Try%20AI%20Habitat%20In%20Your%20Browser%20&color=blue&logo=webgl&labelColor=%23990000&style=for-the-badge&link=https://aihabitat.org/demo)](https://aihabitat.org/demo)
+

+
+---
+
+## Table of contents
+ 1. [Motivation](#motivation)
+ 1. [Citing Habitat](#citing-habitat)
+ 1. [Installation](#installation)
+ 1. [Example](#example)
+ 1. [Documentation](#documentation)
+ 1. [Docker Setup](#docker-setup)
+ 1. [Details](#details)
+ 1. [Data](#data)
+ 1. [Baselines](#baselines)
+ 1. [License](#license)
+ 1. [Acknowledgments](#acknowledgments)
+ 1. [References](#references-and-citation)
+
+## Motivation
+While there has been significant progress in the vision and language communities thanks to recent advances in deep representations, we believe there is a growing disconnect between ‘internet AI’ and embodied AI. The focus of the former is pattern recognition in images, videos, and text on datasets typically curated from the internet. The focus of the latter is to enable action by an embodied agent in an environment (e.g. a robot). This brings to the forefront issues of active perception, long-term planning, learning from interaction, and holding a dialog grounded in an environment.
+
+To this end, we aim to standardize the entire ‘software stack’ for training embodied agents – scanning the world and creating highly photorealistic 3D assets, developing the next generation of highly efficient and parallelizable simulators, specifying embodied AI tasks that enable us to benchmark scientific progress, and releasing modular high-level libraries to train and deploy embodied agents.
+
+## Citing Habitat
+If you use the Habitat platform in your research, please cite the following [paper](https://arxiv.org/abs/1904.01201):
+
+```
+@inproceedings{habitat19iccv,
+  title     = {Habitat: {A} {P}latform for {E}mbodied {AI} {R}esearch},
+  author    = {Manolis Savva and Abhishek Kadian and Oleksandr Maksymets and Yili Zhao and Erik Wijmans and Bhavana Jain and Julian Straub and Jia Liu and Vladlen Koltun and Jitendra Malik and Devi Parikh and Dhruv Batra},
+  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
+  year      = {2019}
+}
+```
+
+## Installation
+
+1. Clone a stable version from the GitHub repository and install habitat-lab using the commands below. Note that python>=3.6 is required for working with habitat-lab. All the development and testing was done using python3.6, so please use 3.6 to avoid possible issues.
+
+   ```bash
+   git clone --branch stable https://github.com/facebookresearch/habitat-lab.git
+   cd habitat-lab
+   pip install -e .
+   ```
+
+   The command above will install only the core of Habitat Lab. To include habitat_baselines along with all additional requirements, use the command below instead:
+
+   ```bash
+   git clone --branch stable https://github.com/facebookresearch/habitat-lab.git
+   cd habitat-lab
+   pip install -r requirements.txt
+   python setup.py develop --all # install habitat and habitat_baselines
+   ```
+
+2. Install `habitat-sim` from the [github repo](https://github.com/facebookresearch/habitat-sim).
+
+3. Download the [test scenes data](http://dl.fbaipublicfiles.com/habitat/habitat-test-scenes.zip) and extract the `data` folder in the zip to `habitat-lab/data/`, where `habitat-lab/` is the GitHub repository folder.
+
+4. Run the example script `python examples/example.py`, which at the end should print out the number of steps the agent took inside an environment (e.g. `Episode finished after 2 steps.`). To verify that tests pass, run `python setup.py test`, which should print out a log of passed, skipped and failed tests.
+
+5. Run `python examples/benchmark.py` to evaluate a forward-only agent in the test environment downloaded in step 3.
+
+## Example
+
+An example code snippet that uses [`tasks/pointnav.yaml`](configs/tasks/pointnav.yaml) to configure the task and agent:
+
+```python
+import habitat
+
+# Load embodied AI task (PointNav) and a pre-specified virtual robot
+env = habitat.Env(
+    config=habitat.get_config("configs/tasks/pointnav.yaml")
+)
+
+observations = env.reset()
+
+# Step through environment with random actions
+while not env.episode_over:
+    observations = env.step(env.action_space.sample())
+```
+
+See [`examples/register_new_sensors_and_measures.py`](examples/register_new_sensors_and_measures.py) for an example of how to extend habitat-lab from _outside_ the source code.
+
+## Documentation
+
+Habitat Lab documentation is available [here](https://aihabitat.org/docs/habitat-lab/index.html).
+
+For example, see [this page](https://aihabitat.org/docs/habitat-lab/quickstart.html) for a quickstart example.
+
+
+## Docker Setup
+We also provide a docker setup for habitat. This works on machines with an NVIDIA GPU and requires users to install [nvidia-docker](https://github.com/NVIDIA/nvidia-docker). The following [Dockerfile](Dockerfile) was used to build the habitat docker. To set up the habitat stack using docker, follow the steps below:
+
+1. Pull the habitat docker image: `docker pull fairembodied/habitat:latest`
+
+1. Start an interactive bash session inside the habitat docker: `docker run --runtime=nvidia -it fairhabitat/habitat:v1`
+
+1. Activate the habitat conda environment: `source activate habitat`
+
+1. Benchmark a forward-only agent on the test scenes data: `cd habitat-api; python examples/benchmark.py`. This should print out an output like:
+```bash
+2019-02-25 02:39:48,680 initializing sim Sim-v0
+2019-02-25 02:39:49,655 initializing task Nav-v0
+spl: 0.000
+```
+
+## Details
+An important objective of Habitat Lab is to make it easy for users to set up a variety of embodied agent tasks in 3D environments. The process of setting up a task involves using environment information provided by the simulator, connecting the information with a dataset (e.g. PointGoal targets, or question and answer pairs for Embodied QA) and providing observations which can be used by the agents. Keeping this primary objective in mind, the core API defines the following key concepts as abstractions that can be extended:
+
+* `Env`: the fundamental environment concept for Habitat. All the information needed for working on embodied tasks with a simulator is abstracted inside an Env. This class acts as a base for other derived environment classes. Env consists of three major components: a Simulator, a Dataset (containing Episodes), and a Task, and it serves to connect these three components together.
+
+* `Dataset`: contains a list of task-specific episodes from a particular data split and additional dataset-wide information. Handles loading and saving of a dataset to disk, getting a list of scenes, and getting a list of episodes for a particular scene.
+
+* `Episode`: a class for episode specification that includes the initial position and orientation of an Agent, a scene id, a goal position and optionally shortest paths to the goal. An episode is a description of one task instance for the agent.
+

+[Figure ("teaser results"): Architecture of Habitat Lab]
+
+* `Task`: this class builds on top of the simulator and dataset. The criteria of episode termination and measures of success are provided by the Task.
+
+* `Sensor`: a generalization of the physical Sensor concept provided by a Simulator, with the capability to provide Task-specific Observation data in a specified format.
+
+* `Observation`: data representing an observation from a Sensor. This can correspond to physical sensors on an Agent (e.g. RGB, depth, semantic segmentation masks, collision sensors) or more abstract sensors such as the current agent state.
+
+Note that the core functionality defines fundamental building blocks such as the API for interacting with the simulator backend and receiving observations through Sensors. Concrete simulation backends, 3D datasets, and embodied agent baselines are implemented on top of the core API.
+
+## Data
+To make things easier, we expect a `data` folder (or a symlink to one) with a particular structure in the habitat-lab working directory.
+
+### Scenes datasets
+| Scenes models | Extract path | Archive size |
+| --- | --- | --- |
+| [Gibson](#Gibson) | `data/scene_datasets/gibson/{scene}.glb` | 1.5 GB |
+| [MatterPort3D](#Matterport3D) | `data/scene_datasets/mp3d/{scene}/{scene}.glb` | 15 GB |
+
+#### Matterport3D
+The full Matterport3D (MP3D) dataset for use with Habitat can be downloaded using the official [Matterport3D](https://niessner.github.io/Matterport/) download script as follows: `python download_mp.py --task habitat -o data/scene_datasets/mp3d/`. You only need the habitat zip archive and not the entire Matterport3D dataset. Note that this download script requires python 2.7 to run. Extract the Matterport data to `data/scene_datasets/mp3d`.
+
+#### Gibson
+Download the Habitat-related Gibson dataset following the instructions [here](https://github.com/StanfordVL/GibsonEnv#database). After downloading, extract the dataset to the `habitat-lab/data/scene_datasets/gibson/` folder (this folder should contain the `.glb` files from Gibson).
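+
+As a quick sanity check of the layout described above, the following minimal sketch (ours, not part of the repository; the `expected_paths` list and the MP3D scene ID `17DRP5sb8fy` are only illustrative) verifies that scene assets sit at the expected extract paths:
+
+```python
+from pathlib import Path
+
+# Extract paths taken from the "Scenes datasets" table above.
+# "17DRP5sb8fy" is just one example MP3D scene ID; substitute your own.
+expected_paths = [
+    Path("data/scene_datasets/gibson"),                            # holds {scene}.glb files
+    Path("data/scene_datasets/mp3d/17DRP5sb8fy/17DRP5sb8fy.glb"),  # {scene}/{scene}.glb layout
+]
+
+for path in expected_paths:
+    print(f"{path}: {'found' if path.exists() else 'MISSING'}")
+```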
+ + +### Task datasets +| Task | Scenes | Link | Extract path | Config to use | Archive size | +| --- | --- | --- | --- | --- | --- | +| [Point goal navigation](https://arxiv.org/abs/1807.06757) | Gibson | [pointnav_gibson_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/pointnav/gibson/v1/pointnav_gibson_v1.zip) | `data/datasets/pointnav/gibson/v1/` | [`datasets/pointnav/gibson.yaml`](configs/datasets/pointnav/gibson.yaml) | 385 MB | +| [Point goal navigation corresponding to Sim2LoCoBot experiment configuration](https://arxiv.org/abs/1912.06321) | Gibson | [pointnav_gibson_v2.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/pointnav/gibson/v2/pointnav_gibson_v2.zip) | `data/datasets/pointnav/gibson/v2/` | [`datasets/pointnav/gibson_v2.yaml`](configs/datasets/pointnav/gibson.yaml) | 274 MB | +| [Point goal navigation](https://arxiv.org/abs/1807.06757) | MatterPort3D | [pointnav_mp3d_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/pointnav/mp3d/v1/pointnav_mp3d_v1.zip) | `data/datasets/pointnav/mp3d/v1/` | [`datasets/pointnav/mp3d.yaml`](configs/datasets/pointnav/mp3d.yaml) | 400 MB | +| Object goal navigation | MatterPort3D | [objectnav_mp3d_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/objectnav/m3d/v1/objectnav_mp3d_v1.zip) | `data/datasets/objectnav/mp3d/v1/` | [`datasets/objectnav/mp3d.yaml`](configs/datasets/objectnav/mp3d.yaml) | 170 MB | +| [Embodied Question Answering](https://embodiedqa.org/) | MatterPort3D | [eqa_mp3d_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/eqa/mp3d/v1/eqa_mp3d_v1.zip) | `data/datasets/eqa/mp3d/v1/` | [`datasets/eqa/mp3d.yaml`](configs/datasets/eqa/mp3d.yaml) | 44 MB | +| [Visual Language Navigation](https://bringmeaspoon.org/) | MatterPort3D | [vln_r2r_mp3d_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/vln/mp3d/r2r/v1/vln_r2r_mp3d_v1.zip) | `data/datasets/vln/mp3d/r2r/v1` | [`datasets/vln/mp3d_r2r.yaml`](configs/datasets/vln/mp3d_r2r.yaml) | 2.7 MB | +| [Image goal navigation](https://github.com/facebookresearch/habitat-lab/pull/333) | Gibson | [pointnav_gibson_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/pointnav/gibson/v1/pointnav_gibson_v1.zip) | `data/datasets/pointnav/gibson/v1/` | [`datasets/imagenav/gibson.yaml`](configs/datasets/imagenav/gibson.yaml) | 385 MB | +| [Image goal navigation](https://github.com/facebookresearch/habitat-lab/pull/333) | MatterPort3D | [pointnav_mp3d_v1.zip](https://dl.fbaipublicfiles.com/habitat/data/datasets/pointnav/mp3d/v1/pointnav_mp3d_v1.zip) | `data/datasets/pointnav/mp3d/v1/` | [`datasets/imagenav/mp3d.yaml`](configs/datasets/imagenav/mp3d.yaml) | 400 MB | + +To use an episode dataset provide related config to the Env in [the example](#example) or use the config for [RL agent training](habitat_baselines/README.md#reinforcement-learning-rl). + +## Baselines +Habitat Lab includes reinforcement learning (via PPO) and classical SLAM based baselines. For running PPO training on sample data and more details refer [habitat_baselines/README.md](habitat_baselines/README.md). + +## Habitat-PyRobot +Habitat Lab supports deployment of models on a physical robot through PyRobot (https://github.com/facebookresearch/pyrobot). Please install the python3 version of PyRobot and refer to `habitat.sims.pyrobot.pyrobot` for instructions. This functionality allows deployment of models across simulation and reality. 
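+
+Tying the task-dataset table above to the [Example](#example) section, here is a minimal sketch (ours; it assumes `objectnav_mp3d_v1.zip` has been extracted to the path listed in the table) that runs one random-agent episode on an episode dataset via its task config:
+
+```python
+import habitat
+
+# configs/tasks/objectnav_mp3d.yaml points DATA_PATH at
+# data/datasets/objectnav/mp3d/v1/{split}/{split}.json.gz,
+# matching the extract path in the table above.
+config = habitat.get_config("configs/tasks/objectnav_mp3d.yaml")
+
+env = habitat.Env(config=config)
+observations = env.reset()
+
+# Step through the environment with random actions until the episode ends
+while not env.episode_over:
+    observations = env.step(env.action_space.sample())
+env.close()
+```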
+ +## Acknowledgments +The Habitat project would not have been possible without the support and contributions of many individuals. We would like to thank Dmytro Mishkin, Xinlei Chen, Georgia Gkioxari, Daniel Gordon, Leonidas Guibas, Saurabh Gupta, Or Litany, Marcus Rohrbach, Amanpreet Singh, Devendra Singh Chaplot, Yuandong Tian, and Yuxin Wu for many helpful conversations and guidance on the design and development of the Habitat platform. + +## License +Habitat Lab is MIT licensed. See the [LICENSE file](habitat_baselines/LICENSE) for details. + +The trained models and the task datasets are considered data derived from the correspondent scene datasets. +- Matterport3D based task datasets and trained models are distributed with [Matterport3D Terms of Use](http://kaldir.vc.in.tum.de/matterport/MP_TOS.pdf) and under [CC BY-NC-SA 3.0 US license](https://creativecommons.org/licenses/by-nc-sa/3.0/us/). +- Gibson based task datasets, the code for generating such datasets, and trained models are distributed with [Gibson Terms of Use](https://storage.googleapis.com/gibson_material/Agreement%20GDS%2006-04-18.pdf) and under [CC BY-NC-SA 3.0 US license](https://creativecommons.org/licenses/by-nc-sa/3.0/us/). + +## References and Citation +1. [Habitat: A Platform for Embodied AI Research](https://arxiv.org/abs/1904.01201). Manolis Savva, Abhishek Kadian, Oleksandr Maksymets, Yili Zhao, Erik Wijmans, Bhavana Jain, Julian Straub, Jia Liu, Vladlen Koltun, Jitendra Malik, Devi Parikh, Dhruv Batra. IEEE/CVF International Conference on Computer Vision (ICCV), 2019. diff --git a/habitat-lab-dialog/configs/baselines/ppo.yaml b/habitat-lab-dialog/configs/baselines/ppo.yaml new file mode 100644 index 0000000..839df23 --- /dev/null +++ b/habitat-lab-dialog/configs/baselines/ppo.yaml @@ -0,0 +1,4 @@ +TRAINER: + RL: + SUCCESS_REWARD: 10.0 + SLACK_REWARD: -0.01 diff --git a/habitat-lab-dialog/configs/datasets/eqa/mp3d.yaml b/habitat-lab-dialog/configs/datasets/eqa/mp3d.yaml new file mode 100644 index 0000000..98e16d4 --- /dev/null +++ b/habitat-lab-dialog/configs/datasets/eqa/mp3d.yaml @@ -0,0 +1,5 @@ +DATASET: + TYPE: MP3DEQA-v1 + SPLIT: train + DATA_PATH: "data/datasets/eqa/mp3d/v1/{split}/{split}.json.gz" + SCENES_DIR: "data/scene_datasets/" diff --git a/habitat-lab-dialog/configs/datasets/imagenav/gibson.yaml b/habitat-lab-dialog/configs/datasets/imagenav/gibson.yaml new file mode 100644 index 0000000..7993233 --- /dev/null +++ b/habitat-lab-dialog/configs/datasets/imagenav/gibson.yaml @@ -0,0 +1,4 @@ +DATASET: + TYPE: PointNav-v1 + SPLIT: train + DATA_PATH: data/datasets/pointnav/gibson/v1/{split}/{split}.json.gz diff --git a/habitat-lab-dialog/configs/datasets/imagenav/mp3d.yaml b/habitat-lab-dialog/configs/datasets/imagenav/mp3d.yaml new file mode 100644 index 0000000..d0822b0 --- /dev/null +++ b/habitat-lab-dialog/configs/datasets/imagenav/mp3d.yaml @@ -0,0 +1,4 @@ +DATASET: + TYPE: PointNav-v1 + SPLIT: train + DATA_PATH: data/datasets/pointnav/mp3d/v1/{split}/{split}.json.gz diff --git a/habitat-lab-dialog/configs/datasets/objectnav/mp3d.yaml b/habitat-lab-dialog/configs/datasets/objectnav/mp3d.yaml new file mode 100644 index 0000000..f9dd2b6 --- /dev/null +++ b/habitat-lab-dialog/configs/datasets/objectnav/mp3d.yaml @@ -0,0 +1,4 @@ +DATASET: + TYPE: ObjectNav-v1 + SPLIT: train + DATA_PATH: data/datasets/objectnav/mp3d/v1/{split}/{split}.json.gz diff --git a/habitat-lab-dialog/configs/datasets/pointnav/gibson.yaml b/habitat-lab-dialog/configs/datasets/pointnav/gibson.yaml new file mode 100644 index 
0000000..7993233 --- /dev/null +++ b/habitat-lab-dialog/configs/datasets/pointnav/gibson.yaml @@ -0,0 +1,4 @@ +DATASET: + TYPE: PointNav-v1 + SPLIT: train + DATA_PATH: data/datasets/pointnav/gibson/v1/{split}/{split}.json.gz diff --git a/habitat-lab-dialog/configs/datasets/pointnav/gibson_v2.yaml b/habitat-lab-dialog/configs/datasets/pointnav/gibson_v2.yaml new file mode 100644 index 0000000..1d997a3 --- /dev/null +++ b/habitat-lab-dialog/configs/datasets/pointnav/gibson_v2.yaml @@ -0,0 +1,11 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + AGENT_0: + HEIGHT: 0.88 + RADIUS: 0.18 + +DATASET: + TYPE: PointNav-v1 + SPLIT: train + DATA_PATH: data/datasets/pointnav/gibson/v2/{split}/{split}.json.gz diff --git a/habitat-lab-dialog/configs/datasets/pointnav/habitat_test.yaml b/habitat-lab-dialog/configs/datasets/pointnav/habitat_test.yaml new file mode 100644 index 0000000..cf07c2f --- /dev/null +++ b/habitat-lab-dialog/configs/datasets/pointnav/habitat_test.yaml @@ -0,0 +1,4 @@ +DATASET: + TYPE: PointNav-v1 + SPLIT: train + DATA_PATH: data/datasets/pointnav/habitat-test-scenes/v1/{split}/{split}.json.gz diff --git a/habitat-lab-dialog/configs/datasets/pointnav/mp3d.yaml b/habitat-lab-dialog/configs/datasets/pointnav/mp3d.yaml new file mode 100644 index 0000000..d0822b0 --- /dev/null +++ b/habitat-lab-dialog/configs/datasets/pointnav/mp3d.yaml @@ -0,0 +1,4 @@ +DATASET: + TYPE: PointNav-v1 + SPLIT: train + DATA_PATH: data/datasets/pointnav/mp3d/v1/{split}/{split}.json.gz diff --git a/habitat-lab-dialog/configs/datasets/single_episode.yaml b/habitat-lab-dialog/configs/datasets/single_episode.yaml new file mode 100644 index 0000000..cfff4d7 --- /dev/null +++ b/habitat-lab-dialog/configs/datasets/single_episode.yaml @@ -0,0 +1,5 @@ +ENVIRONMENT: + ITERATOR_OPTIONS: + GROUP_BY_SCENE: True + NUM_EPISODE_SAMPLE: 1 + SHUFFLE: False diff --git a/habitat-lab-dialog/configs/datasets/vln/mp3d_r2r.yaml b/habitat-lab-dialog/configs/datasets/vln/mp3d_r2r.yaml new file mode 100644 index 0000000..b373498 --- /dev/null +++ b/habitat-lab-dialog/configs/datasets/vln/mp3d_r2r.yaml @@ -0,0 +1,5 @@ +DATASET: + TYPE: R2RVLN-v1 + SPLIT: train + DATA_PATH: "data/datasets/vln/mp3d/r2r/v1/{split}/{split}.json.gz" + SCENES_DIR: "data/scene_datasets/" diff --git a/habitat-lab-dialog/configs/tasks/eqa_mp3d.yaml b/habitat-lab-dialog/configs/tasks/eqa_mp3d.yaml new file mode 100644 index 0000000..5ccfddc --- /dev/null +++ b/habitat-lab-dialog/configs/tasks/eqa_mp3d.yaml @@ -0,0 +1,29 @@ +TASK: + TYPE: EQA-v0 + SENSORS: ['QUESTION_SENSOR'] + POSSIBLE_ACTIONS: ['MOVE_FORWARD', 'TURN_LEFT', 'TURN_RIGHT', 'ANSWER'] + MEASUREMENTS: ['EPISODE_INFO', 'DISTANCE_TO_GOAL', 'ANSWER_ACCURACY'] + +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 + +SIMULATOR: + AGENT_0: + SENSORS: ['RGB_SENSOR', 'SEMANTIC_SENSOR', 'DEPTH_SENSOR'] + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 256 + HEIGHT: 256 + DEPTH_SENSOR: + WIDTH: 256 + HEIGHT: 256 + SEMANTIC_SENSOR: + WIDTH: 256 + HEIGHT: 256 + +DATASET: + TYPE: MP3DEQA-v1 + SPLIT: train + DATA_PATH: "data/datasets/eqa/mp3d/v1/{split}/{split}.json.gz" + SCENES_DIR: "data/scene_datasets/" diff --git a/habitat-lab-dialog/configs/tasks/imagenav.yaml b/habitat-lab-dialog/configs/tasks/imagenav.yaml new file mode 100644 index 0000000..4d6689e --- /dev/null +++ b/habitat-lab-dialog/configs/tasks/imagenav.yaml @@ -0,0 +1,24 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 1000 + +SIMULATOR: + AGENT_0: + SENSORS: ["RGB_SENSOR", "DEPTH_SENSOR"] + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 256 + 
HEIGHT: 256 + DEPTH_SENSOR: + WIDTH: 256 + HEIGHT: 256 + +TASK: + TYPE: Nav-v0 + SUCCESS_DISTANCE: 1. + + SENSORS: ['IMAGEGOAL_SENSOR'] + + MEASUREMENTS: ["DISTANCE_TO_GOAL", "SUCCESS", "SPL", "SOFT_SPL"] + SUCCESS: + SUCCESS_DISTANCE: 1. diff --git a/habitat-lab-dialog/configs/tasks/imagenav_gibson.yaml b/habitat-lab-dialog/configs/tasks/imagenav_gibson.yaml new file mode 100644 index 0000000..3766de9 --- /dev/null +++ b/habitat-lab-dialog/configs/tasks/imagenav_gibson.yaml @@ -0,0 +1,29 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 1000 + ITERATOR_OPTIONS: + MAX_SCENE_REPEAT_STEPS: 50000 +SIMULATOR: + AGENT_0: + SENSORS: ["RGB_SENSOR", "DEPTH_SENSOR"] + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 256 + HEIGHT: 256 + DEPTH_SENSOR: + WIDTH: 256 + HEIGHT: 256 +TASK: + TYPE: Nav-v0 + SUCCESS_DISTANCE: 1. + + SENSORS: ['IMAGEGOAL_SENSOR'] + + MEASUREMENTS: ["DISTANCE_TO_GOAL", "SUCCESS", "SPL", "SOFT_SPL"] + SUCCESS: + SUCCESS_DISTANCE: 1. + +DATASET: + TYPE: PointNav-v1 + SPLIT: train + DATA_PATH: data/datasets/pointnav/gibson/v1/{split}/{split}.json.gz diff --git a/habitat-lab-dialog/configs/tasks/objectnav_mp3d.yaml b/habitat-lab-dialog/configs/tasks/objectnav_mp3d.yaml new file mode 100644 index 0000000..86f6f9b --- /dev/null +++ b/habitat-lab-dialog/configs/tasks/objectnav_mp3d.yaml @@ -0,0 +1,51 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 + +SIMULATOR: + TURN_ANGLE: 30 + TILT_ANGLE: 30 + ACTION_SPACE_CONFIG: "v1" + AGENT_0: + SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR'] + HEIGHT: 0.88 + RADIUS: 0.18 + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + ALLOW_SLIDING: False + SEMANTIC_SENSOR: + WIDTH: 640 + HEIGHT: 480 + HFOV: 79 + POSITION: [0, 0.88, 0] + RGB_SENSOR: + WIDTH: 640 + HEIGHT: 480 + HFOV: 79 + POSITION: [0, 0.88, 0] + DEPTH_SENSOR: + WIDTH: 640 + HEIGHT: 480 + HFOV: 79 + MIN_DEPTH: 0.5 + MAX_DEPTH: 5.0 + POSITION: [0, 0.88, 0] +TASK: + TYPE: ObjectNav-v1 + POSSIBLE_ACTIONS: ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT", "LOOK_UP", "LOOK_DOWN"] + SUCCESS_DISTANCE: 0.1 + + SENSORS: ['OBJECTGOAL_SENSOR', 'COMPASS_SENSOR', 'GPS_SENSOR'] + GOAL_SENSOR_UUID: objectgoal + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL'] + + DISTANCE_TO_GOAL: + DISTANCE_TO: VIEW_POINTS + SUCCESS: + SUCCESS_DISTANCE: 0.1 + +DATASET: + TYPE: ObjectNav-v1 + SPLIT: train + DATA_PATH: "data/datasets/objectnav/mp3d/v1/{split}/{split}.json.gz" + SCENES_DIR: "data/scene_datasets/" diff --git a/habitat-lab-dialog/configs/tasks/pointnav.yaml b/habitat-lab-dialog/configs/tasks/pointnav.yaml new file mode 100644 index 0000000..9c9b3f2 --- /dev/null +++ b/habitat-lab-dialog/configs/tasks/pointnav.yaml @@ -0,0 +1,26 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + AGENT_0: + SENSORS: ['RGB_SENSOR'] + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + RGB_SENSOR: + WIDTH: 256 + HEIGHT: 256 + DEPTH_SENSOR: + WIDTH: 256 + HEIGHT: 256 +TASK: + TYPE: Nav-v0 + SUCCESS_DISTANCE: 0.2 + + SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR'] + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "POLAR" + DIMENSIONALITY: 2 + GOAL_SENSOR_UUID: pointgoal_with_gps_compass + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL'] + SUCCESS: + SUCCESS_DISTANCE: 0.2 diff --git a/habitat-lab-dialog/configs/tasks/pointnav_gibson.yaml b/habitat-lab-dialog/configs/tasks/pointnav_gibson.yaml new file mode 100644 index 0000000..3cffe68 --- /dev/null +++ b/habitat-lab-dialog/configs/tasks/pointnav_gibson.yaml @@ -0,0 +1,31 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + AGENT_0: + SENSORS: ['RGB_SENSOR'] + HABITAT_SIM_V0: + 
+    GPU_DEVICE_ID: 0
+  RGB_SENSOR:
+    WIDTH: 256
+    HEIGHT: 256
+  DEPTH_SENSOR:
+    WIDTH: 256
+    HEIGHT: 256
+TASK:
+  TYPE: Nav-v0
+  SUCCESS_DISTANCE: 0.2
+
+  SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR']
+  POINTGOAL_WITH_GPS_COMPASS_SENSOR:
+    GOAL_FORMAT: "POLAR"
+    DIMENSIONALITY: 2
+  GOAL_SENSOR_UUID: pointgoal_with_gps_compass
+
+  MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL']
+  SUCCESS:
+    SUCCESS_DISTANCE: 0.2
+
+DATASET:
+  TYPE: PointNav-v1
+  SPLIT: train
+  DATA_PATH: data/datasets/pointnav/gibson/v1/{split}/{split}.json.gz
diff --git a/habitat-lab-dialog/configs/tasks/pointnav_mp3d.yaml b/habitat-lab-dialog/configs/tasks/pointnav_mp3d.yaml
new file mode 100644
index 0000000..1fdac8e
--- /dev/null
+++ b/habitat-lab-dialog/configs/tasks/pointnav_mp3d.yaml
@@ -0,0 +1,30 @@
+ENVIRONMENT:
+  MAX_EPISODE_STEPS: 500
+SIMULATOR:
+  AGENT_0:
+    SENSORS: ['RGB_SENSOR']
+  HABITAT_SIM_V0:
+    GPU_DEVICE_ID: 0
+  RGB_SENSOR:
+    WIDTH: 256
+    HEIGHT: 256
+  DEPTH_SENSOR:
+    WIDTH: 256
+    HEIGHT: 256
+TASK:
+  TYPE: Nav-v0
+  SUCCESS_DISTANCE: 0.2
+
+  SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR']
+  POINTGOAL_WITH_GPS_COMPASS_SENSOR:
+    GOAL_FORMAT: "POLAR"
+    DIMENSIONALITY: 2
+  GOAL_SENSOR_UUID: pointgoal_with_gps_compass
+
+  MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL']
+  SUCCESS:
+    SUCCESS_DISTANCE: 0.2
+DATASET:
+  TYPE: PointNav-v1
+  SPLIT: train
+  DATA_PATH: data/datasets/pointnav/mp3d/v1/{split}/{split}.json.gz
diff --git a/habitat-lab-dialog/configs/tasks/pointnav_rgbd.yaml b/habitat-lab-dialog/configs/tasks/pointnav_rgbd.yaml
new file mode 100644
index 0000000..70b268b
--- /dev/null
+++ b/habitat-lab-dialog/configs/tasks/pointnav_rgbd.yaml
@@ -0,0 +1,26 @@
+ENVIRONMENT:
+  MAX_EPISODE_STEPS: 500
+SIMULATOR:
+  AGENT_0:
+    SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR']
+  HABITAT_SIM_V0:
+    GPU_DEVICE_ID: 0
+  RGB_SENSOR:
+    WIDTH: 256
+    HEIGHT: 256
+  DEPTH_SENSOR:
+    WIDTH: 256
+    HEIGHT: 256
+TASK:
+  TYPE: Nav-v0
+  SUCCESS_DISTANCE: 0.2
+
+  SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR']
+  POINTGOAL_WITH_GPS_COMPASS_SENSOR:
+    GOAL_FORMAT: "POLAR"
+    DIMENSIONALITY: 2
+  GOAL_SENSOR_UUID: pointgoal_with_gps_compass
+
+  MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL']
+  SUCCESS:
+    SUCCESS_DISTANCE: 0.2
diff --git a/habitat-lab-dialog/configs/tasks/vln_r2r.yaml b/habitat-lab-dialog/configs/tasks/vln_r2r.yaml
new file mode 100644
index 0000000..a53d510
--- /dev/null
+++ b/habitat-lab-dialog/configs/tasks/vln_r2r.yaml
@@ -0,0 +1,31 @@
+ENVIRONMENT:
+  MAX_EPISODE_STEPS: 500
+SIMULATOR:
+  AGENT_0:
+    SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR']
+  FORWARD_STEP_SIZE: 0.25
+  TURN_ANGLE: 15
+  HABITAT_SIM_V0:
+    GPU_DEVICE_ID: 0
+  RGB_SENSOR:
+    WIDTH: 256
+    HEIGHT: 256
+    HFOV: 90
+    TYPE: HabitatSimRGBSensor
+  DEPTH_SENSOR:
+    WIDTH: 256
+    HEIGHT: 256
+TASK:
+  TYPE: VLN-v0
+  SUCCESS_DISTANCE: 3.0
+  SENSORS: ['INSTRUCTION_SENSOR']
+  INSTRUCTION_SENSOR_UUID: instruction
+  POSSIBLE_ACTIONS: ['STOP', 'MOVE_FORWARD', 'TURN_LEFT', 'TURN_RIGHT']
+  MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL']
+  SUCCESS:
+    SUCCESS_DISTANCE: 3.0
+DATASET:
+  TYPE: R2RVLN-v1
+  SPLIT: train
+  DATA_PATH: "data/datasets/vln/mp3d/r2r/v1/{split}/{split}.json.gz"
+  SCENES_DIR: "data/scene_datasets/"
diff --git a/habitat-lab-dialog/configs/test/habitat_all_sensors_test.yaml b/habitat-lab-dialog/configs/test/habitat_all_sensors_test.yaml
new file mode 100644
index 0000000..30e49e2
--- /dev/null
+++ b/habitat-lab-dialog/configs/test/habitat_all_sensors_test.yaml
@@ -0,0 +1,30 @@
+ENVIRONMENT:
+  MAX_EPISODE_STEPS: 10
+  ITERATOR_OPTIONS:
+    SHUFFLE: False
+SIMULATOR:
+ AGENT_0: + SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR'] + RGB_SENSOR: + WIDTH: 256 + HEIGHT: 256 + DEPTH_SENSOR: + WIDTH: 256 + HEIGHT: 256 +DATASET: + TYPE: PointNav-v1 + SPLIT: train + DATA_PATH: data/datasets/pointnav/habitat-test-scenes/v1/{split}/{split}.json.gz +TASK: + TYPE: Nav-v0 + SUCCESS_DISTANCE: 0.2 + SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR'] + POSSIBLE_ACTIONS: ['STOP', 'MOVE_FORWARD', 'TURN_LEFT', 'TURN_RIGHT', 'TELEPORT'] + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "POLAR" + DIMENSIONALITY: 2 + GOAL_SENSOR_UUID: pointgoal_with_gps_compass + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL', 'SOFT_SPL'] + SUCCESS: + SUCCESS_DISTANCE: 0.2 diff --git a/habitat-lab-dialog/configs/test/habitat_mp3d_eqa_test.yaml b/habitat-lab-dialog/configs/test/habitat_mp3d_eqa_test.yaml new file mode 100644 index 0000000..6242e6e --- /dev/null +++ b/habitat-lab-dialog/configs/test/habitat_mp3d_eqa_test.yaml @@ -0,0 +1,33 @@ +TASK: + TYPE: EQA-v0 + SENSORS: ['QUESTION_SENSOR'] + POSSIBLE_ACTIONS: ['MOVE_FORWARD', 'TURN_LEFT', 'TURN_RIGHT', 'ANSWER'] + MEASUREMENTS: ['EPISODE_INFO', 'DISTANCE_TO_GOAL', 'ANSWER_ACCURACY'] + +ENVIRONMENT: + ITERATOR_OPTIONS: + SHUFFLE: False + +SIMULATOR: + SCENE: data/scene_datasets/mp3d/17DRP5sb8fy/17DRP5sb8fy.glb + FORWARD_STEP_SIZE: 0.1 + TURN_ANGLE: 9 + AGENT_0: + ANGULAR_ACCELERATION: 15.7 + ANGULAR_FRICTION: 1.0 + COEFFICIENT_OF_RESTITUTION: 0.15707963267 + LINEAR_ACCELERATION: 10.0 + LINEAR_FRICTION: 1.0 + SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR', 'SEMANTIC_SENSOR'] + RGB_SENSOR: + HEIGHT: 512 + WIDTH: 512 + HFOV: 45 + POSITION: [0, 1.09, 0] + TYPE: HabitatSimRGBSensor + +DATASET: + TYPE: MP3DEQA-v1 + SPLIT: val + DATA_PATH: "data/datasets/eqa/mp3d/v1/{split}/{split}.json.gz" + SCENES_DIR: "data/scene_datasets/" diff --git a/habitat-lab-dialog/configs/test/habitat_mp3d_object_nav_test.yaml b/habitat-lab-dialog/configs/test/habitat_mp3d_object_nav_test.yaml new file mode 100644 index 0000000..f89c575 --- /dev/null +++ b/habitat-lab-dialog/configs/test/habitat_mp3d_object_nav_test.yaml @@ -0,0 +1,49 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + TURN_ANGLE: 30 + TILT_ANGLE: 30 + ACTION_SPACE_CONFIG: "v1" + AGENT_0: + SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR'] + HEIGHT: 0.88 + RADIUS: 0.2 + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + SEMANTIC_SENSOR: + WIDTH: 640 + HEIGHT: 480 + HFOV: 79 + POSITION: [0, 0.88, 0] + RGB_SENSOR: + WIDTH: 640 + HEIGHT: 480 + HFOV: 79 + POSITION: [0, 0.88, 0] + DEPTH_SENSOR: + WIDTH: 640 + HEIGHT: 480 + HFOV: 79 + MIN_DEPTH: 0.5 + MAX_DEPTH: 5.0 + POSITION: [0, 0.88, 0] +TASK: + TYPE: ObjectNav-v1 + POSSIBLE_ACTIONS: ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT", "LOOK_UP", "LOOK_DOWN"] + SUCCESS_DISTANCE: 0.1 + + SENSORS: ['OBJECTGOAL_SENSOR', 'COMPASS_SENSOR', 'GPS_SENSOR'] + GOAL_SENSOR_UUID: objectgoal + + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL'] + SUCCESS: + SUCCESS_DISTANCE: 0.2 + DISTANCE_TO_GOAL: + DISTANCE_TO: VIEW_POINTS + +DATASET: + TYPE: ObjectNav-v1 + SPLIT: val + CONTENT_SCENES: ["*"] + DATA_PATH: "data/datasets/objectnav/mp3d/v1/{split}/{split}.json.gz" + SCENES_DIR: "data/scene_datasets/" diff --git a/habitat-lab-dialog/configs/test/habitat_r2r_vln_test.yaml b/habitat-lab-dialog/configs/test/habitat_r2r_vln_test.yaml new file mode 100644 index 0000000..7e7f01e --- /dev/null +++ b/habitat-lab-dialog/configs/test/habitat_r2r_vln_test.yaml @@ -0,0 +1,31 @@ +ENVIRONMENT: + MAX_EPISODE_STEPS: 500 +SIMULATOR: + FORWARD_STEP_SIZE: 0.25 + TURN_ANGLE: 15 + HABITAT_SIM_V0: + GPU_DEVICE_ID: 0 + 
RGB_SENSOR: + WIDTH: 256 + HEIGHT: 256 + HFOV: 90 + TYPE: HabitatSimRGBSensor + DEPTH_SENSOR: + WIDTH: 256 + HEIGHT: 256 +TASK: + TYPE: VLN-v0 + SENSORS: ['POINTGOAL_WITH_GPS_COMPASS_SENSOR', 'INSTRUCTION_SENSOR'] + POINTGOAL_WITH_GPS_COMPASS_SENSOR: + GOAL_FORMAT: "POLAR" + DIMENSIONALITY: 2 + GOAL_SENSOR_UUID: pointgoal_with_gps_compass + INSTRUCTION_SENSOR_UUID: instruction + MEASUREMENTS: ['DISTANCE_TO_GOAL', 'SUCCESS', 'SPL'] + SUCCESS: + SUCCESS_DISTANCE: 3.0 +DATASET: + TYPE: R2RVLN-v1 + SPLIT: val_seen + DATA_PATH: "data/datasets/vln/mp3d/r2r/v1/{split}/{split}.json.gz" + SCENES_DIR: "data/scene_datasets/" diff --git a/habitat-lab-dialog/configs/test/new_keys_test.yaml b/habitat-lab-dialog/configs/test/new_keys_test.yaml new file mode 100644 index 0000000..af9cbb8 --- /dev/null +++ b/habitat-lab-dialog/configs/test/new_keys_test.yaml @@ -0,0 +1,6 @@ +ENVIRONMENT: + NEW_KEY: 20 + ITERATOR_OPTIONS: + MY_PARAM: "test" +TASK: + MY_NEW_TASK_PARAM: test diff --git a/habitat-lab-dialog/docs/.gitignore b/habitat-lab-dialog/docs/.gitignore new file mode 100644 index 0000000..6687855 --- /dev/null +++ b/habitat-lab-dialog/docs/.gitignore @@ -0,0 +1 @@ +m.math.cache diff --git a/habitat-lab-dialog/docs/build-public.sh b/habitat-lab-dialog/docs/build-public.sh new file mode 100755 index 0000000..4a3599c --- /dev/null +++ b/habitat-lab-dialog/docs/build-public.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Propagate failures properly +set -e + +mcss_path=../../habitat-sim/docs/m.css + +# Regenerate the compiled CSS file (yes, in the sim repository, to allow fast +# iterations from here as well) +$mcss_path/css/postprocess.py \ + ../../habitat-sim/docs/theme.css \ + $mcss_path/css/m-grid.css \ + $mcss_path/css/m-components.css \ + $mcss_path/css/m-layout.css \ + ../../habitat-sim/docs/pygments-pastie.css \ + $mcss_path/css/pygments-console.css \ + $mcss_path/css/m-documentation.css \ + -o ../../habitat-sim/docs/theme.compiled.css + +$mcss_path/documentation/python.py conf-public.py + +# The file:// URLs are usually clickable in the terminal, directly opening a +# browser +echo "------------------------------------------------------------------------" +echo "Public docs were successfully generated to the following location. Note" +echo "that the search functionality requires a web server in this case." +echo +echo "file://$(pwd)/../../habitat-sim/build/docs-public/habitat-lab/index.html" diff --git a/habitat-lab-dialog/docs/build.sh b/habitat-lab-dialog/docs/build.sh new file mode 100755 index 0000000..c2f4627 --- /dev/null +++ b/habitat-lab-dialog/docs/build.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +# Propagate failures properly +set -e + +if [[ $# -eq 1 ]]; then + export mcss_path=$1 +elif [[ $# -ne 0 ]]; then + echo "usage: ./build.sh [path-to-m.css]" + exit 1 +else + if [ ! 
-d ../../habitat-sim/docs/m.css ]; then + echo "m.css submodule not found in the sim repository, please run git submodule update --init there or specify the path to it" + exit 1 + fi + mcss_path=../../habitat-sim/docs/m.css +fi + +# Regenerate the compiled CSS file (yes, in the sim repository, to allow fast +# iterations from here as well) +$mcss_path/css/postprocess.py \ + ../../habitat-sim/docs/theme.css \ + $mcss_path/css/m-grid.css \ + $mcss_path/css/m-components.css \ + $mcss_path/css/m-layout.css \ + ../../habitat-sim/docs/pygments-pastie.css \ + $mcss_path/css/pygments-console.css \ + $mcss_path/css/m-documentation.css \ + -o ../../habitat-sim/docs/theme.compiled.css + +$mcss_path/documentation/python.py conf.py + +# The file:// URLs are usually clickable in the terminal, directly opening a +# browser +echo "------------------------------------------------------------------------" +echo "Docs were successfully generated. Open the following link to view them:" +echo +echo "file://$(pwd)/../../habitat-sim/build/docs/habitat-lab/index.html" diff --git a/habitat-lab-dialog/docs/conf-public.py b/habitat-lab-dialog/docs/conf-public.py new file mode 100644 index 0000000..a0b11d3 --- /dev/null +++ b/habitat-lab-dialog/docs/conf-public.py @@ -0,0 +1,38 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import sys + +sys.path.append(os.path.dirname(os.path.realpath(__file__))) + +# Inherit everything from the local config +from conf import * # isort:skip + +OUTPUT = "../../habitat-sim/build/docs-public/habitat-lab/" + +SEARCH_DOWNLOAD_BINARY = "searchdata-v1.bin" +SEARCH_BASE_URL = "https://aihabitat.org/docs/habitat-lab/" +SEARCH_EXTERNAL_URL = "https://google.com/search?q=site:aihabitat.org+{query}" + +M_SPHINX_INVENTORIES = [ + ( + "../../habitat-sim/docs/python.inv", + "https://docs.python.org/3/", + [], + ["m-doc-external"], + ), + ( + "../../habitat-sim/docs/numpy.inv", + "https://docs.scipy.org/doc/numpy/", + [], + ["m-doc-external"], + ), + ( + "../../habitat-sim/build/docs-public/habitat-sim/objects.inv", + "../habitat-sim/", + [], + ["m-doc-external"], + ), +] diff --git a/habitat-lab-dialog/docs/conf.py b/habitat-lab-dialog/docs/conf.py new file mode 100644 index 0000000..db9399b --- /dev/null +++ b/habitat-lab-dialog/docs/conf.py @@ -0,0 +1,115 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import os +import sys + +# TODO make this less brittle +sys.path = [os.path.join(os.path.dirname(__file__), "../")] + sys.path + +import habitat # isort:skip + +# Overrides the __all__ as that one pulls everything into the root module +# and doesn't expose any submodules +habitat.__all__ = ["config", "core", "Agent", "Benchmark"] +habitat.core.__all__ = [ + "env", + "embodied_task", + "dataset", + "simulator", + "registry", + "vector_env", +] +# yacs.config isn't ours, so don't document it +habitat.config.__all__.remove("Config") + +PROJECT_TITLE = "Habitat" +PROJECT_SUBTITLE = "Lab Docs" +PROJECT_LOGO = "../../habitat-sim/docs/habitat.svg" +FAVICON = "../../habitat-sim/docs/habitat-blue.png" +MAIN_PROJECT_URL = "/" +INPUT_MODULES = [habitat] +INPUT_DOCS = ["docs.rst"] +INPUT_PAGES = [ + "pages/index.rst", + "pages/quickstart.rst", + "pages/habitat-sim-demo.rst", + "pages/habitat-lab-demo.rst", + "pages/view-transform-warp.rst", +] + +PLUGINS = [ + "m.abbr", + "m.code", + "m.components", + "m.dox", + "m.gh", + "m.htmlsanity", + "m.images", + "m.link", + "m.math", + "m.sphinx", +] + +CLASS_INDEX_EXPAND_LEVELS = 2 + +PYBIND11_COMPATIBILITY = True +ATTRS_COMPATIBILITY = True + +# Putting output into the sim repository so relative linking works the same +# way as on the website +OUTPUT = "../../habitat-sim/build/docs/habitat-lab/" + +LINKS_NAVBAR1 = [ + ( + "Pages", + "pages", + [ + ("Quickstart", "quickstart"), + ("Habitat Sim Demo", "habitat-sim-demo"), + ("Habitat Lab Demo", "habitat-lab-demo"), + ("View, Transform and Warp", "view-transform-warp"), + ], + ), + ("Classes", "classes", []), +] +LINKS_NAVBAR2 = [ + ("Sim Docs", "../habitat-sim/index.html", []), +] + +FINE_PRINT = f""" +| {PROJECT_TITLE} {PROJECT_SUBTITLE}. Copyright © 2019 Facebook AI Research. +| Created with `m.css Python doc generator `_.""" + +STYLESHEETS = [ + "https://fonts.googleapis.com/css?family=Source+Sans+Pro:400,400i,600,600i%7CSource+Code+Pro:400,400i,600", + "../../habitat-sim/docs/theme.compiled.css", +] + +M_SPHINX_INVENTORIES = [ + ( + "../../habitat-sim/docs/python.inv", + "https://docs.python.org/3/", + [], + ["m-doc-external"], + ), + ( + "../../habitat-sim/docs/numpy.inv", + "https://docs.scipy.org/doc/numpy/", + [], + ["m-doc-external"], + ), + ( + "../../habitat-sim/build/docs/habitat-sim/objects.inv", + "../habitat-sim/", + [], + ["m-doc-external"], + ), +] +M_SPHINX_INVENTORY_OUTPUT = "objects.inv" +M_SPHINX_PARSE_DOCSTRINGS = True + +M_HTMLSANITY_SMART_QUOTES = True +# Will people hate me if I enable this? +# M_HTMLSANITY_HYPHENATION = True diff --git a/habitat-lab-dialog/docs/docs.rst b/habitat-lab-dialog/docs/docs.rst new file mode 100644 index 0000000..490a3d0 --- /dev/null +++ b/habitat-lab-dialog/docs/docs.rst @@ -0,0 +1,15 @@ +.. + Stuff defined here gets set globally for everything else: + + - use :py:`code` for inline code with highlighted Python syntax +.. + +.. role:: py(code) + :language: py + +.. due to current limitations in m.css, all underscored members have to be + listed here in order to be visible, it's not enough to list them in a class + / module docstring. All underscored members are otherwise treated as + private and not exposed in the docs + +.. 
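For instance, with the role defined above, writing :py:`env.step(action)` in
any docs page renders as inline Python with syntax highlighting.

..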
py:data:: habitat.core.embodied_task.Measure._metric
diff --git a/habitat-lab-dialog/docs/pages/habitat-lab-demo.png b/habitat-lab-dialog/docs/pages/habitat-lab-demo.png
new file mode 100644
index 0000000..fa60541
Binary files /dev/null and b/habitat-lab-dialog/docs/pages/habitat-lab-demo.png differ
diff --git a/habitat-lab-dialog/docs/pages/habitat-lab-demo.rst b/habitat-lab-dialog/docs/pages/habitat-lab-demo.rst
new file mode 100644
index 0000000..aaddb40
--- /dev/null
+++ b/habitat-lab-dialog/docs/pages/habitat-lab-demo.rst
@@ -0,0 +1,172 @@
+Habitat Lab Demo
+################
+
+.. contents::
+    :class: m-block m-default
+
+.. code:: py
+
+    import habitat
+
+    import numpy as np
+    import random
+
+    %matplotlib inline
+    import matplotlib.pyplot as plt
+
+All the boilerplate code in habitat-sim for setting up the sensor and agent
+configs is abstracted away by the Habitat Lab config system. The default config
+is at :gh:`habitat/config/default.py `.
+You can override defaults by specifying them in a separate file and passing it
+to the :ref:`habitat.config.get_config()` function, or by defrosting the config
+object, overriding parameters and freezing the config.
+
+.. code-figure::
+
+    .. code:: py
+
+        config = habitat.get_config(config_paths='../configs/tasks/pointnav_mp3d.yaml')
+        config.defrost()
+        config.DATASET.DATA_PATH = '../data/datasets/pointnav/mp3d/v1/val/val.json.gz'
+        config.DATASET.SCENES_DIR = '../data/scene_datasets/'
+        config.freeze()
+
+        env = habitat.Env(config=config)
+
+    .. code:: shell-session
+        :class: m-nopad
+
+        2019-06-06 16:11:35,200 initializing sim Sim-v0
+        2019-06-06 16:11:46,171 initializing task Nav-v0
+
+`Scene semantic annotations`_
+=============================
+
+.. code-figure::
+
+    .. code:: py
+
+        def print_scene_recur(scene, limit_output=10):
+            count = 0
+            for level in scene.levels:
+                print(
+                    f"Level id:{level.id}, center:{level.aabb.center},"
+                    f" dims:{level.aabb.sizes}"
+                )
+                for region in level.regions:
+                    print(
+                        f"Region id:{region.id}, category:{region.category.name()},"
+                        f" center:{region.aabb.center}, dims:{region.aabb.sizes}"
+                    )
+                    for obj in region.objects:
+                        print(
+                            f"Object id:{obj.id}, category:{obj.category.name()},"
+                            f" center:{obj.aabb.center}, dims:{obj.aabb.sizes}"
+                        )
+                        count += 1
+                        if count >= limit_output:
+                            return None

+        # Print semantic annotation information (id, category, bounding box details)
+        # for the current scene in a hierarchical fashion
+        scene = env.sim.semantic_annotations()
+        print_scene_recur(scene, limit_output=15)
+
+        env.close()
+        # Note: Since only one OpenGL context is allowed per process,
+        # you have to close the current env before instantiating a new one.
+
+    ..
code:: shell-session + :class: m-nopad m-console-wrap + + Level id:0, center:[11.0210495 3.996935 3.3452997], dims:[ 43.0625 8.19569 -30.1122 ] + Region id:0_0, category:rec/game, center:[16.61225 2.7802274 11.577564 ], dims:[10.364299 5.5838847 -4.14447 ] + Object id:0_0_0, category:ceiling, center:[16.5905 4.54488 11.269 ], dims:[9.984315 4.0917997 2.1377602] + Object id:0_0_1, category:wall, center:[16.5865 2.6818905 13.4147 ], dims:[9.69278 0.5280709 5.4398193] + Object id:0_0_2, category:wall, center:[21.6013 1.7400599 11.3493 ], dims:[3.5423203 0.41668844 3.921341 ] + Object id:0_0_3, category:door, center:[11.5374 1.2431393 10.386599 ], dims:[1.2573967 2.5311599 0.41445923] + Object id:0_0_4, category:door, center:[20.6332 1.2136002 13.5958 ], dims:[0.15834427 2.4860601 1.1674671 ] + Object id:0_0_5, category:wall, center:[16.5946 2.66614 9.331001], dims:[9.72554 0.23693037 5.3787804 ] + Object id:0_0_6, category:window, center:[16.5822 2.852209 13.596898], dims:[1.5934639 0.16375065 1.2588081 ] + Object id:0_0_7, category:beam, center:[16.6094 5.32839 11.348299], dims:[0.5116577 0.35226822 3.8936386 ] + Object id:0_0_8, category:floor, center:[16.586 0.07907867 11.406 ], dims:[10.48608 4.3792195 0.2833004] + Object id:0_0_9, category:lighting, center:[11.798 1.9214487 11.313999 ], dims:[0.25683594 0.5076561 0.15560722] + Object id:0_0_10, category:wall, center:[11.57 1.7476702 11.3347 ], dims:[3.54352 0.41701245 3.9231815 ] + Object id:0_0_11, category:misc, center:[16.5943 2.29591 11.4341 ], dims:[10.428299 4.48172 4.676901] + Object id:0_0_12, category:door, center:[11.5234 1.2489185 12.228199 ], dims:[1.2521439 2.5423803 0.46386147] + Object id:0_0_13, category:door, center:[16.5833 1.1790485 13.490699 ], dims:[5.45306 0.3474083 2.4161606] + Object id:0_0_14, category:window, center:[21.6362 1.2518396 12.2613 ], dims:[1.1998444 2.5486398 0.37800598] + +`Actions and sensors`_ +====================== + +.. 
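One aside before moving on to actions: because every object in the hierarchy
above exposes :py:`category.name()`, simple aggregations over the scene are
easy. A minimal sketch (an editorial addition, assuming an :py:`env` created
as in the previous section and not yet closed):

.. code:: py

    from collections import Counter

    # Tally object categories across the scene's levels -> regions -> objects
    # hierarchy, the same traversal used by print_scene_recur above.
    scene = env.sim.semantic_annotations()
    counts = Counter(
        obj.category.name()
        for level in scene.levels
        for region in level.regions
        for obj in region.objects
    )
    print(counts.most_common(5))

..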
code:: py + :class: m-console-wrap + + import numpy as np + from PIL import Image + from habitat_sim.utils.common import d3_40_colors_rgb + + def display_sample(rgb_obs, semantic_obs, depth_obs): + rgb_img = Image.fromarray(rgb_obs, mode="RGB") + + semantic_img = Image.new("P", (semantic_obs.shape[1], semantic_obs.shape[0])) + semantic_img.putpalette(d3_40_colors_rgb.flatten()) + semantic_img.putdata((semantic_obs.flatten() % 40).astype(np.uint8)) + semantic_img = semantic_img.convert("RGBA") + + depth_img = Image.fromarray((depth_obs * 255).astype(np.uint8), mode="L") + + arr = [rgb_img, semantic_img, depth_img] + + titles = ['rgb', 'semantic', 'depth'] + plt.figure(figsize=(12 ,8)) + for i, data in enumerate(arr): + ax = plt.subplot(1, 3, i+1) + ax.axis('off') + ax.set_title(titles[i]) + plt.imshow(data) + plt.show() + + config = habitat.get_config(config_paths='../configs/tasks/pointnav_mp3d.yaml') + config.defrost() + config.DATASET.DATA_PATH = '../data/datasets/pointnav/mp3d/v1/val/val.json.gz' + config.DATASET.SCENES_DIR = '../data/scene_datasets/' + config.SIMULATOR.AGENT_0.SENSORS = ['RGB_SENSOR', 'DEPTH_SENSOR', 'SEMANTIC_SENSOR'] + config.SIMULATOR.SEMANTIC_SENSOR.WIDTH = 256 + config.SIMULATOR.SEMANTIC_SENSOR.HEIGHT = 256 + config.SIMULATOR.TURN_ANGLE = 30 + config.freeze() + + env = habitat.Env(config=config) + env.episodes = random.sample(env.episodes, 2) + + max_steps = 4 + + action_mapping = { + 0: 'stop', + 1: 'move_forward', + 2: 'turn left', + 3: 'turn right' + } + + for i in range(len(env.episodes)): + observations = env.reset() + + display_sample(observations['rgb'], observations['semantic'], np.squeeze(observations['depth'])) + + count_steps = 0 + while count_steps < max_steps: + action = random.choice(list(action_mapping.keys())) + print(action_mapping[action]) + observations = env.step(action) + display_sample(observations['rgb'], observations['semantic'], np.squeeze(observations['depth'])) + + count_steps += 1 + if env.episode_over: + break + + env.close() + +.. image:: habitat-lab-demo.png + :alt: Actions and sensors diff --git a/habitat-lab-dialog/docs/pages/habitat-sim-demo.png b/habitat-lab-dialog/docs/pages/habitat-sim-demo.png new file mode 100644 index 0000000..2763b13 Binary files /dev/null and b/habitat-lab-dialog/docs/pages/habitat-sim-demo.png differ diff --git a/habitat-lab-dialog/docs/pages/habitat-sim-demo.rst b/habitat-lab-dialog/docs/pages/habitat-sim-demo.rst new file mode 100644 index 0000000..53724cf --- /dev/null +++ b/habitat-lab-dialog/docs/pages/habitat-sim-demo.rst @@ -0,0 +1,227 @@ +Habitat Sim Demo +################ + +.. button-primary:: https://dl.fbaipublicfiles.com/habitat/notebooks/habitat-sim-demo.ipynb + + Download notebook + + habitat-sim-demo.ipynb + +.. contents:: + :class: m-block m-default + +.. code:: py + + import habitat_sim + + import random + %matplotlib inline + import matplotlib.pyplot as plt + + import numpy as np + + test_scene = "../data/scene_datasets/mp3d/17DRP5sb8fy/17DRP5sb8fy.glb" + + sim_settings = { + "width": 256, # Spatial resolution of the observations + "height": 256, + "scene": test_scene, # Scene path + "default_agent": 0, + "sensor_height": 1.5, # Height of sensors in meters + "color_sensor": True, # RGB sensor + "semantic_sensor": True, # Semantic sensor + "depth_sensor": True, # Depth sensor + "seed": 1, + } + +`Simulator config`_ +=================== + +.. 
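The factory below turns this plain settings dict into simulator and agent
configurations. Since it is just a dict, variants are cheap to construct
before handing them over; a small sketch (key names as defined above, values
hypothetical):

.. code:: py

    # A low-resolution variant of the settings above with the semantic
    # sensor disabled; make_cfg below consumes it unchanged.
    fast_settings = dict(sim_settings, width=128, height=128)
    fast_settings["semantic_sensor"] = False

..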
code:: py + :class: m-console-wrap + + def make_cfg(settings): + sim_cfg = habitat_sim.SimulatorConfiguration() + sim_cfg.gpu_device_id = 0 + sim_cfg.scene_id = settings["scene"] + + # Note: all sensors must have the same resolution + sensors = { + "color_sensor": { + "sensor_type": habitat_sim.SensorType.COLOR, + "resolution": [settings["height"], settings["width"]], + "position": [0.0, settings["sensor_height"], 0.0], + }, + "depth_sensor": { + "sensor_type": habitat_sim.SensorType.DEPTH, + "resolution": [settings["height"], settings["width"]], + "position": [0.0, settings["sensor_height"], 0.0], + }, + "semantic_sensor": { + "sensor_type": habitat_sim.SensorType.SEMANTIC, + "resolution": [settings["height"], settings["width"]], + "position": [0.0, settings["sensor_height"], 0.0], + }, + } + + sensor_specs = [] + for sensor_uuid, sensor_params in sensors.items(): + if settings[sensor_uuid]: + sensor_spec = habitat_sim.SensorSpec() + sensor_spec.uuid = sensor_uuid + sensor_spec.sensor_type = sensor_params["sensor_type"] + sensor_spec.resolution = sensor_params["resolution"] + sensor_spec.position = sensor_params["position"] + + sensor_specs.append(sensor_spec) + + # Here you can specify the amount of displacement in a forward action and the turn angle + agent_cfg = habitat_sim.agent.AgentConfiguration() + agent_cfg.sensor_specifications = sensor_specs + agent_cfg.action_space = { + "move_forward": habitat_sim.agent.ActionSpec( + "move_forward", habitat_sim.agent.ActuationSpec(amount=0.25) + ), + "turn_left": habitat_sim.agent.ActionSpec( + "turn_left", habitat_sim.agent.ActuationSpec(amount=30.0) + ), + "turn_right": habitat_sim.agent.ActionSpec( + "turn_right", habitat_sim.agent.ActuationSpec(amount=30.0) + ), + } + + return habitat_sim.Configuration(sim_cfg, [agent_cfg]) + + cfg = make_cfg(sim_settings) + sim = habitat_sim.Simulator(cfg) + +`Scene semantic annotations`_ +============================= + +.. code-figure:: + + .. code:: py + :class: m-console-wrap + + def print_scene_recur(scene, limit_output=10): + print(f"House has {len(scene.levels)} levels, {len(scene.regions)} regions and {len(scene.objects)} objects") + print(f"House center:{scene.aabb.center} dims:{scene.aabb.sizes}") + + count = 0 + for level in scene.levels: + print( + f"Level id:{level.id}, center:{level.aabb.center}," + f" dims:{level.aabb.sizes}" + ) + for region in level.regions: + print( + f"Region id:{region.id}, category:{region.category.name()}," + f" center:{region.aabb.center}, dims:{region.aabb.sizes}" + ) + for obj in region.objects: + print( + f"Object id:{obj.id}, category:{obj.category.name()}," + f" center:{obj.aabb.center}, dims:{obj.aabb.sizes}" + ) + count += 1 + if count >= limit_output: + return None + + # Print semantic annotation information (id, category, bounding box details) + # about levels, regions and objects in a hierarchical fashion + scene = sim.semantic_scene + print_scene_recur(scene) + + .. 
code:: shell-session + :class: m-nopad m-console-wrap + + House has 1 levels, 10 regions and 187 objects + House center:[-2.7928102 1.3372793 -1.5051247] dims:[17.57338 2.9023628 -8.8595495] + Level id:0, center:[-3.157365 1.3372804 -1.5051247], dims:[16.69967 2.9023607 -8.8595495] + Region id:0_0, category:bedroom, center:[-8.821845 1.259409 -2.6915383], dims:[ 4.1633096 2.5356617 -4.207343 ] + Object id:0_0_0, category:wall, center:[-8.86568 1.2817702 -2.73879 ], dims:[2.58148 4.5891 4.59182] + Object id:0_0_1, category:ceiling, center:[-8.91329 2.20326 -2.80575], dims:[4.4761996 4.46008 0.7124357] + Object id:0_0_2, category:misc, center:[-8.69572 1.1633401 -4.2134695], dims:[2.5021195 0.61951023 2.34074 ] + Object id:0_0_3, category:curtain, center:[-10.9129 1.0454602 -2.9228697], dims:[2.134861 0.49171448 3.8549194 ] + Object id:0_0_4, category:void, center:[-8.06444 1.4491596 -1.7219999], dims:[0.8975539 1.5347222 0.6184306] + Object id:0_0_5, category:bed, center:[-8.71032 0.6567161 -2.7839994], dims:[1.2672672 2.0257597 2.45652 ] + Object id:0_0_6, category:void, center:[-6.79918 1.40336 -1.91666], dims:[0.08472061 0.8195841 0.28476596] + Object id:0_0_7, category:tv_monitor, center:[-10.9803 1.01896 -1.43764], dims:[1.0417404 0.5545361 1.2688993] + Object id:0_0_9, category:chest_of_drawers, center:[-9.89281 0.31491923 -3.5474799 ], dims:[0.47650528 0.63675606 0.57509613] + Object id:0_0_10, category:cushion, center:[-9.2041 0.5827892 -3.71507 ], dims:[1.0096397 0.31469202 0.90284204] + +.. code-figure:: + + .. code:: py + + random.seed(sim_settings["seed"]) + sim.seed(sim_settings["seed"]) + + # Set agent state + agent = sim.initialize_agent(sim_settings["default_agent"]) + agent_state = habitat_sim.AgentState() + agent_state.position = np.array([0.0, 0.072447, 0.0]) + agent.set_state(agent_state) + + # Get agent state + agent_state = agent.get_state() + print("agent_state: position", agent_state.position, "rotation", agent_state.rotation) + + .. code:: shell-session + :class: m-nopad m-console-wrap + + agent_state: position [0. 0.072447 0. ] rotation quaternion(1, 0, 0, 0) + +.. code:: py + + from PIL import Image + from habitat_sim.utils.common import d3_40_colors_rgb + + def display_sample(rgb_obs, semantic_obs, depth_obs): + rgb_img = Image.fromarray(rgb_obs, mode="RGBA") + + semantic_img = Image.new("P", (semantic_obs.shape[1], semantic_obs.shape[0])) + semantic_img.putpalette(d3_40_colors_rgb.flatten()) + semantic_img.putdata((semantic_obs.flatten() % 40).astype(np.uint8)) + semantic_img = semantic_img.convert("RGBA") + + depth_img = Image.fromarray((depth_obs / 10 * 255).astype(np.uint8), mode="L") + + arr = [rgb_img, semantic_img, depth_img] + titles = ['rgb', 'semantic', 'depth'] + plt.figure(figsize=(12 ,8)) + for i, data in enumerate(arr): + ax = plt.subplot(1, 3, i+1) + ax.axis('off') + ax.set_title(titles[i]) + plt.imshow(data) + plt.show() + +`Random actions`_ +================= + +.. code:: py + + total_frames = 0 + action_names = list( + cfg.agents[ + sim_settings["default_agent"] + ].action_space.keys() + ) + + max_frames = 5 + + while total_frames < max_frames: + action = random.choice(action_names) + print("action", action) + observations = sim.step(action) + rgb = observations["color_sensor"] + semantic = observations["semantic_sensor"] + depth = observations["depth_sensor"] + + display_sample(rgb, semantic, depth) + + total_frames += 1 + +.. 
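After a random walk like this, it can help to confirm where the agent ended
up. A short sketch (not part of the original page) using the agent state API
shown earlier; it assumes the :py:`sim` and :py:`sim_settings` defined above:

.. code:: py

    # Inspect the agent's final pose after the random-action loop.
    end_state = sim.get_agent(sim_settings["default_agent"]).get_state()
    print("position", end_state.position, "rotation", end_state.rotation)

..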
image:: habitat-sim-demo.png
    :alt: Actions and sensors
diff --git a/habitat-lab-dialog/docs/pages/index.rst b/habitat-lab-dialog/docs/pages/index.rst
new file mode 100644
index 0000000..7b59eef
--- /dev/null
+++ b/habitat-lab-dialog/docs/pages/index.rst
@@ -0,0 +1,24 @@
+Habitat Lab Documentation
+#########################
+
+A modular high-level library to train embodied AI agents across a variety of
+tasks, environments, and simulators.
+
+`Tutorials`_
+============
+
+- :ref:`Quickstart `
+- :ref:`Habitat Sim Demo `
+- :ref:`Habitat Lab Demo `
+- :ref:`View, Transform and Warp `
+
+`Package reference`_
+====================
+
+- :ref:`habitat.core.env`
+- :ref:`habitat.core.embodied_task`
+- :ref:`habitat.core.dataset`
+- :ref:`habitat.core.simulator`
+- :ref:`habitat.core.vector_env`
+- :ref:`habitat.Agent`
+- :ref:`habitat.Benchmark`
diff --git a/habitat-lab-dialog/docs/pages/quickstart.png b/habitat-lab-dialog/docs/pages/quickstart.png
new file mode 100644
index 0000000..891a53c
Binary files /dev/null and b/habitat-lab-dialog/docs/pages/quickstart.png differ
diff --git a/habitat-lab-dialog/docs/pages/quickstart.rst b/habitat-lab-dialog/docs/pages/quickstart.rst
new file mode 100644
index 0000000..ef97159
--- /dev/null
+++ b/habitat-lab-dialog/docs/pages/quickstart.rst
@@ -0,0 +1,152 @@
+Quickstart
+##########
+
+In this quickstart we will briefly introduce the Habitat stack, use it to set
+up the PointNav task, and step around in the environment.
+
+.. role:: sh(code)
+    :language: sh
+
+`Habitat`_
+==========
+
+Habitat is a platform for embodied AI research that consists of:
+
+1. **Habitat-Sim**: A flexible, high-performance 3D simulator with
+   configurable agents, multiple sensors, and generic 3D dataset handling
+   (with built-in support for
+   `MatterPort3D `_,
+   `Gibson `_ and other datasets).
+   :gh:`[github-repo] `
+
+2. **Habitat Lab**: A modular high-level library for end-to-end development in
+   embodied AI --- defining embodied AI tasks (e.g. navigation, instruction
+   following, question answering), configuring embodied agents (physical form,
+   sensors, capabilities), training these agents (via imitation or
+   reinforcement learning, or no learning at all as in classical SLAM), and
+   benchmarking their performance on the defined tasks using standard metrics.
+   :gh:`[github-repo] `
+
+To install Habitat-Sim and Habitat Lab, follow the instructions
+`here `_.
+
+`Example`_
+==========
+
+In this example we will set up a PointNav task in which the agent must go from
+a source location to a target location. For this example the agent will be you
+(the user), and you will step around in the environment using keys.
+
+To run this example, both Habitat-Sim and Habitat Lab must be installed. The
+scene data should also be downloaded (steps to do this are provided in the
+`installation instructions `_ of Habitat Lab). Running the code below
+also requires cv2, which you can install using: :sh:`pip install opencv-python`.
+
+..
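A quick way to sanity-check these prerequisites before running the full
example (a minimal check, not part of the original page; the version
attributes are standard but worth verifying in your install):

.. code:: py

    # All of these imports must succeed for the example below to run.
    import cv2
    import habitat
    import habitat_sim

    print("cv2", cv2.__version__, "habitat-sim", habitat_sim.__version__)

..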
code:: py
+
+    import habitat
+    from habitat.sims.habitat_simulator.actions import HabitatSimActions
+    import cv2
+
+
+    FORWARD_KEY="w"
+    LEFT_KEY="a"
+    RIGHT_KEY="d"
+    FINISH="f"
+
+
+    def transform_rgb_bgr(image):
+        return image[:, :, [2, 1, 0]]
+
+
+    def example():
+        env = habitat.Env(
+            config=habitat.get_config("configs/tasks/pointnav.yaml")
+        )
+
+        print("Environment creation successful")
+        observations = env.reset()
+        print("Destination, distance: {:.3f}, theta(radians): {:.2f}".format(
+            observations["pointgoal_with_gps_compass"][0],
+            observations["pointgoal_with_gps_compass"][1]))
+        cv2.imshow("RGB", transform_rgb_bgr(observations["rgb"]))
+
+        print("Agent stepping around inside environment.")
+
+        count_steps = 0
+        while not env.episode_over:
+            keystroke = cv2.waitKey(0)
+
+            if keystroke == ord(FORWARD_KEY):
+                action = HabitatSimActions.MOVE_FORWARD
+                print("action: FORWARD")
+            elif keystroke == ord(LEFT_KEY):
+                action = HabitatSimActions.TURN_LEFT
+                print("action: LEFT")
+            elif keystroke == ord(RIGHT_KEY):
+                action = HabitatSimActions.TURN_RIGHT
+                print("action: RIGHT")
+            elif keystroke == ord(FINISH):
+                action = HabitatSimActions.STOP
+                print("action: FINISH")
+            else:
+                print("INVALID KEY")
+                continue
+
+            observations = env.step(action)
+            count_steps += 1
+
+            print("Destination, distance: {:.3f}, theta(radians): {:.2f}".format(
+                observations["pointgoal_with_gps_compass"][0],
+                observations["pointgoal_with_gps_compass"][1]))
+            cv2.imshow("RGB", transform_rgb_bgr(observations["rgb"]))
+
+        print("Episode finished after {} steps.".format(count_steps))
+
+        if (
+            action == HabitatSimActions.STOP
+            and observations["pointgoal_with_gps_compass"][0] < 0.2
+        ):
+            print("you successfully navigated to destination point")
+        else:
+            print("your navigation was unsuccessful")
+
+
+    if __name__ == "__main__":
+        example()
+
+Running the above code will initialize an agent inside an environment; you can
+move around in the environment using the :label-default:`W`, :label-default:`A`,
+:label-default:`D`, :label-default:`F` keys. The terminal will print the
+destination vector in polar format, with the distance and angle to the goal.
+Once you are within 0.2m of the goal, you can press the :label-default:`F` key
+to ``STOP`` and finish the episode successfully. If your finishing distance to
+the goal is :math:`> 0.2m`, or if you spend more than 500 steps in the
+environment, your episode will be unsuccessful.
+
+Below is a demo of what the example output will look like:
+
+.. image:: quickstart.png
+
+For more examples refer to
+:gh:`Habitat Lab examples `
+and :gh:`Habitat-Sim examples `.
+
+
+
+`Citation`_
+===========
+
+If you use habitat in your work, please cite:
+
+..
code:: bibtex
+    :class: m-console-wrap
+
+    @article{habitat19arxiv,
+      title = {Habitat: A Platform for Embodied AI Research},
+      author = {Manolis Savva, Abhishek Kadian, Oleksandr Maksymets, Yili Zhao, Erik Wijmans, Bhavana Jain, Julian Straub, Jia Liu, Vladlen Koltun, Jitendra Malik, Devi Parikh and Dhruv Batra},
+      journal = {arXiv preprint arXiv:1904.01201},
+      year = {2019}
+    }
diff --git a/habitat-lab-dialog/docs/pages/view-transform-warp.png b/habitat-lab-dialog/docs/pages/view-transform-warp.png
new file mode 100644
index 0000000..cfeef59
Binary files /dev/null and b/habitat-lab-dialog/docs/pages/view-transform-warp.png differ
diff --git a/habitat-lab-dialog/docs/pages/view-transform-warp.rst b/habitat-lab-dialog/docs/pages/view-transform-warp.rst
new file mode 100644
index 0000000..dc79742
--- /dev/null
+++ b/habitat-lab-dialog/docs/pages/view-transform-warp.rst
@@ -0,0 +1,207 @@
+View, Transform and Warp
+########################
+
+.. button-primary:: https://dl.fbaipublicfiles.com/habitat/notebooks/relative_camera_views_transform_and_warping_demo.ipynb
+
+    Download notebook
+
+    relative_cam…demo.ipynb
+
+.. contents::
+    :class: m-block m-default
+
+This page demonstrates how to extract camera parameters in the scene and how
+these camera parameters relate to the given views. We create two cameras and
+use the RGB and depth information to construct a transformation from the view
+of camera 1 to camera 2, then validate that transformation by comparing the
+projected and original views.
+
+.. code-figure::
+
+    .. code:: py
+        :class: m-console-wrap
+
+        import os
+        import numpy as np
+        import quaternion
+
+        import matplotlib.pyplot as plt
+        %matplotlib inline
+
+        import habitat
+
+        import torch.nn.functional as F
+        import torch
+        from torchvision.transforms import ToTensor
+
+        # Set up the environment for testing
+        config = habitat.get_config(config_paths="../configs/tasks/pointnav_rgbd.yaml")
+        config.defrost()
+        config.DATASET.DATA_PATH = '../data/datasets/pointnav/habitat-test-scenes/v1/val/val.json.gz'
+        config.DATASET.SCENES_DIR = '../data/scene_datasets/'
+        config.freeze()
+
+        # Can also do directly in the config file
+        config.defrost()
+        config.SIMULATOR.DEPTH_SENSOR.NORMALIZE_DEPTH = False
+        config.freeze()
+
+        # Intrinsic parameters, assuming width matches height. Requires a simple refactor otherwise
+        W = config.SIMULATOR.DEPTH_SENSOR.WIDTH
+        H = config.SIMULATOR.DEPTH_SENSOR.HEIGHT
+
+        assert(W == H)
+        hfov = float(config.SIMULATOR.DEPTH_SENSOR.HFOV) * np.pi / 180.
+
+
+        env = habitat.Env(config=config)
+
+
+        obs = env.reset()
+        initial_state = env._sim.get_agent_state(0)
+        init_translation = initial_state.position
+        init_rotation = initial_state.rotation
+
+    .. code:: shell-session
+        :class: m-nopad m-console-wrap
+
+        2019-06-11 10:03:34,049 initializing sim Sim-v0
+        I0611 10:03:34.056092 64715 simulator.py:78] Loaded navmesh ../data/scene_datasets/habitat-test-scenes/skokloster-castle.navmesh
+        2019-06-11 10:03:35,053 initializing task Nav-v0
+
+`Randomly permute the rotation`_
+================================
+
+.. code:: py
+
+    def uniform_quat(original_angle):
+        original_euler = quaternion.as_euler_angles(original_angle)
+        euler_angles = np.array([(np.random.rand() - 0.5) * np.pi / 9. + original_euler[0],
+                                 (np.random.rand() - 0.5) * np.pi / 9. + original_euler[1],
+                                 (np.random.rand() - 0.5) * np.pi / 9.
+ original_euler[2]]) + quaternions = quaternion.from_euler_angles(euler_angles) + + + return quaternions + +`Generate two random, overlapping views`_ +========================================= + +.. code:: py + :class: m-console-wrap + + depths = [] + rgbs = [] + cameras = [] + + + for i in range(0, 2): + rotation = uniform_quat(init_rotation) + translation = init_translation + np.random.rand(3,) * 0.5 - 0.25 + + obs = env._sim.get_observations_at(position=translation, rotation=rotation, keep_agent_at_new_pose=True) + depths += [obs["depth"][...,0]] + rgbs += [obs["rgb"]] + + cameras += [env._sim.get_agent_state()] + + env.close() + +`Intrinsic parameters, K`_ +========================== + +.. code:: py + + K = np.array([ + [1 / np.tan(hfov / 2.), 0., 0., 0.], + [0., 1 / np.tan(hfov / 2.), 0., 0.], + [0., 0., 1, 0], + [0., 0., 0, 1]]) + + # Now get an approximation for the true world coordinates -- see if they make sense + # [-1, 1] for x and [1, -1] for y as array indexing is y-down while world is y-up + xs, ys = np.meshgrid(np.linspace(-1,1,W), np.linspace(1,-1,W)) + depth = depths[0].reshape(1,W,W) + xs = xs.reshape(1,W,W) + ys = ys.reshape(1,W,W) + + # Unproject + # negate depth as the camera looks along -Z + xys = np.vstack((xs * depth , ys * depth, -depth, np.ones(depth.shape))) + xys = xys.reshape(4, -1) + xy_c0 = np.matmul(np.linalg.inv(K), xys) + + # Now load in the cameras, are in the format camera --> world + # Camera 1: + quaternion_0 = cameras[0].sensor_states['depth'].rotation + translation_0 = cameras[0].sensor_states['depth'].position + rotation_0 = quaternion.as_rotation_matrix(quaternion_0) + T_world_camera0 = np.eye(4) + T_world_camera0[0:3,0:3] = rotation_0 + T_world_camera0[0:3,3] = translation_0 + + # Camera 2: + translation_1 = cameras[1].sensor_states['depth'].position + quaternion_1 = cameras[1].sensor_states['depth'].rotation + rotation_1 = quaternion.as_rotation_matrix(quaternion_1) + T_world_camera1 = np.eye(4) + T_world_camera1[0:3,0:3] = rotation_1 + T_world_camera1[0:3,3] = translation_1 + + # Invert to get world --> camera + T_camera1_world = np.linalg.inv(T_world_camera1) + + # Transformation matrix between views + # Aka the position of camera0 in camera1's coordinate frame + T_camera1_camera0 = np.matmul(T_camera1_world, T_world_camera0) + + # Finally transform actual points + xy_c1 = np.matmul(T_camera1_camera0, xy_c0) + xy_newimg = np.matmul(K, xy_c1) + + # Normalize by negative depth + xys_newimg = xy_newimg[0:2,:] / -xy_newimg[2:3,:] + # Flip back to y-down to match array indexing + xys_newimg[1] *= -1 + +`And visualise this new transformation`_ +======================================== + +.. code-figure:: + + .. 
code:: py + :class: m-console-wrap + + # Create sampler + sampler = torch.Tensor(xys_newimg).view(2, W, W).permute(1,2,0).unsqueeze(0) + + # Create generated image + img1_tensor = ToTensor()(rgbs[0]).unsqueeze(0) + img2_tensor = ToTensor()(rgbs[1]).unsqueeze(0) + img2_warped = F.grid_sample(img2_tensor, sampler) + + # Visualise + plt.figure(figsize=(10,10)) + ax1 = plt.subplot(221) + ax1.imshow(img1_tensor.squeeze().permute(1,2,0)) + ax1.set_title("View 1", fontsize='large') + ax1.axis('off') + ax1 = plt.subplot(222) + ax1.imshow(img2_tensor.squeeze().permute(1,2,0)) + ax1.set_title("View 2", fontsize='large') + ax1.axis('off') + ax1 = plt.subplot(223) + plt.imshow(img2_warped.squeeze().permute(1,2,0)) + ax1.set_title("View 2 warped into View 1 \n according to the estimated transformation", fontsize='large') + ax1.axis('off') + ax1 = plt.subplot(224) + ax1.imshow(np.abs(img2_warped.squeeze().permute(1,2,0) - img1_tensor.squeeze().permute(1,2,0))) + ax1.set_title("Difference between warped \n and ground truth images", fontsize='large') + ax1.axis('off') + + .. code:: shell-session + :class: m-nopad + + (-0.5, 255.5, 255.5, -0.5) + +.. image:: view-transform-warp.png + :alt: View, Transform and Warp diff --git a/habitat-lab-dialog/examples/__init__.py b/habitat-lab-dialog/examples/__init__.py new file mode 100644 index 0000000..240697e --- /dev/null +++ b/habitat-lab-dialog/examples/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. diff --git a/habitat-lab-dialog/examples/benchmark.py b/habitat-lab-dialog/examples/benchmark.py new file mode 100644 index 0000000..c0a3416 --- /dev/null +++ b/habitat-lab-dialog/examples/benchmark.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import argparse + +import habitat +from habitat.sims.habitat_simulator.actions import HabitatSimActions + + +class ForwardOnlyAgent(habitat.Agent): + def reset(self): + pass + + def act(self, observations): + action = HabitatSimActions.MOVE_FORWARD + return {"action": action} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--task-config", type=str, default="configs/tasks/pointnav.yaml" + ) + args = parser.parse_args() + + agent = ForwardOnlyAgent() + benchmark = habitat.Benchmark(args.task_config) + metrics = benchmark.evaluate(agent, num_episodes=10) + + for k, v in metrics.items(): + print("{}: {:.3f}".format(k, v)) + + +if __name__ == "__main__": + main() diff --git a/habitat-lab-dialog/examples/example.py b/habitat-lab-dialog/examples/example.py new file mode 100644 index 0000000..24e5054 --- /dev/null +++ b/habitat-lab-dialog/examples/example.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
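
# Note: the `with` block below is load-bearing: habitat.Env owns the
# simulator's OpenGL context, and only one such context is allowed per
# process, so the env must be closed before another one can be created.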
+
+import habitat
+
+
+def example():
+    # Note: use `with` for the example testing; it doesn't need to be like
+    # this in the README
+
+    with habitat.Env(
+        config=habitat.get_config("configs/tasks/pointnav.yaml")
+    ) as env:
+        print("Environment creation successful")
+        observations = env.reset()  # noqa: F841
+
+        print("Agent stepping around inside environment.")
+        count_steps = 0
+        while not env.episode_over:
+            observations = env.step(env.action_space.sample())  # noqa: F841
+            count_steps += 1
+        print("Episode finished after {} steps.".format(count_steps))
+
+
+if __name__ == "__main__":
+    example()
diff --git a/habitat-lab-dialog/examples/new_actions.py b/habitat-lab-dialog/examples/new_actions.py
new file mode 100644
index 0000000..eb962d1
--- /dev/null
+++ b/habitat-lab-dialog/examples/new_actions.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+r"""
+This is an example of how to add new actions to habitat-lab
+
+
+We will use the strafe action outlined in the habitat_sim example
+"""
+
+import attr
+import numpy as np
+
+import habitat
+import habitat_sim
+from habitat.sims.habitat_simulator.actions import (
+    HabitatSimActions,
+    HabitatSimV1ActionSpaceConfiguration,
+)
+from habitat.tasks.nav.nav import SimulatorTaskAction
+
+
+@attr.s(auto_attribs=True, slots=True)
+class NoisyStrafeActuationSpec:
+    move_amount: float
+    # Classic strafing is to move perpendicular (90 deg) to the forward direction
+    strafe_angle: float = 90.0
+    noise_amount: float = 0.05
+
+
+def _strafe_impl(
+    scene_node: habitat_sim.SceneNode,
+    move_amount: float,
+    strafe_angle: float,
+    noise_amount: float,
+):
+    forward_ax = (
+        np.array(scene_node.absolute_transformation().rotation_scaling())
+        @ habitat_sim.geo.FRONT
+    )
+    strafe_angle = np.deg2rad(strafe_angle)
+    strafe_angle = np.random.uniform(
+        (1 - noise_amount) * strafe_angle, (1 + noise_amount) * strafe_angle
+    )
+
+    rotation = habitat_sim.utils.quat_from_angle_axis(
+        strafe_angle, habitat_sim.geo.UP
+    )
+    move_ax = habitat_sim.utils.quat_rotate_vector(rotation, forward_ax)
+
+    move_amount = np.random.uniform(
+        (1 - noise_amount) * move_amount, (1 + noise_amount) * move_amount
+    )
+    scene_node.translate_local(move_ax * move_amount)
+
+
+@habitat_sim.registry.register_move_fn(body_action=True)
+class NoisyStrafeLeft(habitat_sim.SceneNodeControl):
+    def __call__(
+        self,
+        scene_node: habitat_sim.SceneNode,
+        actuation_spec: NoisyStrafeActuationSpec,
+    ):
+        print(f"strafing left with noise_amount={actuation_spec.noise_amount}")
+        _strafe_impl(
+            scene_node,
+            actuation_spec.move_amount,
+            actuation_spec.strafe_angle,
+            actuation_spec.noise_amount,
+        )
+
+
+@habitat_sim.registry.register_move_fn(body_action=True)
+class NoisyStrafeRight(habitat_sim.SceneNodeControl):
+    def __call__(
+        self,
+        scene_node: habitat_sim.SceneNode,
+        actuation_spec: NoisyStrafeActuationSpec,
+    ):
+        print(
+            f"strafing right with noise_amount={actuation_spec.noise_amount}"
+        )
+        _strafe_impl(
+            scene_node,
+            actuation_spec.move_amount,
+            -actuation_spec.strafe_angle,
+            actuation_spec.noise_amount,
+        )
+
+
+@habitat.registry.register_action_space_configuration
+class NoNoiseStrafe(HabitatSimV1ActionSpaceConfiguration):
+    def get(self):
+        config = super().get()
+
+        config[HabitatSimActions.STRAFE_LEFT] = habitat_sim.ActionSpec(
+            "noisy_strafe_left",
+            NoisyStrafeActuationSpec(0.25, noise_amount=0.0),
+        )
+
config[HabitatSimActions.STRAFE_RIGHT] = habitat_sim.ActionSpec( + "noisy_strafe_right", + NoisyStrafeActuationSpec(0.25, noise_amount=0.0), + ) + + return config + + +@habitat.registry.register_action_space_configuration +class NoiseStrafe(HabitatSimV1ActionSpaceConfiguration): + def get(self): + config = super().get() + + config[HabitatSimActions.STRAFE_LEFT] = habitat_sim.ActionSpec( + "noisy_strafe_left", + NoisyStrafeActuationSpec(0.25, noise_amount=0.05), + ) + config[HabitatSimActions.STRAFE_RIGHT] = habitat_sim.ActionSpec( + "noisy_strafe_right", + NoisyStrafeActuationSpec(0.25, noise_amount=0.05), + ) + + return config + + +@habitat.registry.register_task_action +class StrafeLeft(SimulatorTaskAction): + def _get_uuid(self, *args, **kwargs) -> str: + return "strafe_left" + + def step(self, *args, **kwargs): + return self._sim.step(HabitatSimActions.STRAFE_LEFT) + + +@habitat.registry.register_task_action +class StrafeRight(SimulatorTaskAction): + def _get_uuid(self, *args, **kwargs) -> str: + return "strafe_right" + + def step(self, *args, **kwargs): + return self._sim.step(HabitatSimActions.STRAFE_RIGHT) + + +def main(): + HabitatSimActions.extend_action_space("STRAFE_LEFT") + HabitatSimActions.extend_action_space("STRAFE_RIGHT") + + config = habitat.get_config(config_paths="configs/tasks/pointnav.yaml") + config.defrost() + + config.TASK.POSSIBLE_ACTIONS = config.TASK.POSSIBLE_ACTIONS + [ + "STRAFE_LEFT", + "STRAFE_RIGHT", + ] + config.TASK.ACTIONS.STRAFE_LEFT = habitat.config.Config() + config.TASK.ACTIONS.STRAFE_LEFT.TYPE = "StrafeLeft" + config.TASK.ACTIONS.STRAFE_RIGHT = habitat.config.Config() + config.TASK.ACTIONS.STRAFE_RIGHT.TYPE = "StrafeRight" + config.SIMULATOR.ACTION_SPACE_CONFIG = "NoNoiseStrafe" + config.freeze() + + with habitat.Env(config=config) as env: + env.reset() + env.step("STRAFE_LEFT") + env.step("STRAFE_RIGHT") + + config.defrost() + config.SIMULATOR.ACTION_SPACE_CONFIG = "NoiseStrafe" + config.freeze() + + with habitat.Env(config=config) as env: + env.reset() + env.step("STRAFE_LEFT") + env.step("STRAFE_RIGHT") + + +if __name__ == "__main__": + main() diff --git a/habitat-lab-dialog/examples/register_new_sensors_and_measures.py b/habitat-lab-dialog/examples/register_new_sensors_and_measures.py new file mode 100644 index 0000000..d9e2ace --- /dev/null +++ b/habitat-lab-dialog/examples/register_new_sensors_and_measures.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
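
# Note on the pattern in this file: measures and sensors are registered with
# habitat's registry via the decorators below, then looked up by the string
# given in the config's TYPE field, which keeps the config and the code
# decoupled.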
+
+from typing import Any
+
+import numpy as np
+from gym import spaces
+
+import habitat
+
+
+# Define the measure and register it with habitat
+# By default, things are registered with the class name
+@habitat.registry.register_measure
+class EpisodeInfoExample(habitat.Measure):
+    def __init__(self, sim, config, **kwargs: Any):
+        # This measure only needs the config
+        self._config = config
+
+        super().__init__()
+
+    # Defines the name of the measure in the measurements dictionary
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return "episode_info"
+
+    # This is called whenever the environment is reset
+    def reset_metric(self, *args: Any, episode, **kwargs: Any):
+        # Our measure always contains all the attributes of the episode
+        self._metric = vars(episode).copy()
+        # But only on reset, it has an additional field of my_value
+        self._metric["my_value"] = self._config.VALUE
+
+    # This is called whenever an action is taken in the environment
+    def update_metric(self, *args: Any, episode, action, **kwargs: Any):
+        # Now the measure will just have all the attributes of the episode
+        self._metric = vars(episode).copy()
+
+
+# Define the sensor and register it with habitat
+# For the sensor, we will register it with a custom name
+@habitat.registry.register_sensor(name="my_supercool_sensor")
+class AgentPositionSensor(habitat.Sensor):
+    def __init__(self, sim, config, **kwargs: Any):
+        super().__init__(config=config)
+
+        self._sim = sim
+        # Prints out the answer to life on init
+        print("The answer to life is", self.config.ANSWER_TO_LIFE)
+
+    # Defines the name of the sensor in the sensor suite dictionary
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return "agent_position"
+
+    # Defines the type of the sensor
+    def _get_sensor_type(self, *args: Any, **kwargs: Any):
+        return habitat.SensorTypes.POSITION
+
+    # Defines the size and range of the observations of the sensor
+    def _get_observation_space(self, *args: Any, **kwargs: Any):
+        return spaces.Box(
+            low=np.finfo(np.float32).min,
+            high=np.finfo(np.float32).max,
+            shape=(3,),
+            dtype=np.float32,
+        )
+
+    # This is called whenever reset is called or an action is taken
+    def get_observation(
+        self, observations, *args: Any, episode, **kwargs: Any
+    ):
+        return self._sim.get_agent_state().position
+
+
+def main():
+    # Get the default config node
+    config = habitat.get_config(config_paths="configs/tasks/pointnav.yaml")
+    config.defrost()
+
+    # Add things to the config for the measure
+    config.TASK.EPISODE_INFO_EXAMPLE = habitat.Config()
+    # The type field is used to look up the measure in the registry.
+ # By default, the things are registered with the class name + config.TASK.EPISODE_INFO_EXAMPLE.TYPE = "EpisodeInfoExample" + config.TASK.EPISODE_INFO_EXAMPLE.VALUE = 5 + # Add the measure to the list of measures in use + config.TASK.MEASUREMENTS.append("EPISODE_INFO_EXAMPLE") + + # Now define the config for the sensor + config.TASK.AGENT_POSITION_SENSOR = habitat.Config() + # Use the custom name + config.TASK.AGENT_POSITION_SENSOR.TYPE = "my_supercool_sensor" + config.TASK.AGENT_POSITION_SENSOR.ANSWER_TO_LIFE = 42 + # Add the sensor to the list of sensors in use + config.TASK.SENSORS.append("AGENT_POSITION_SENSOR") + config.freeze() + + with habitat.Env(config=config) as env: + print(env.reset()["agent_position"]) + print(env.get_metrics()["episode_info"]) + print(env.step("MOVE_FORWARD")["agent_position"]) + print(env.get_metrics()["episode_info"]) + + +if __name__ == "__main__": + main() diff --git a/habitat-lab-dialog/examples/shortest_path_follower_example.py b/habitat-lab-dialog/examples/shortest_path_follower_example.py new file mode 100644 index 0000000..52e6861 --- /dev/null +++ b/habitat-lab-dialog/examples/shortest_path_follower_example.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import shutil + +import numpy as np + +import habitat +from habitat.core.utils import try_cv2_import +from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower +from habitat.utils.visualizations import maps +from habitat.utils.visualizations.utils import images_to_video + +cv2 = try_cv2_import() + +IMAGE_DIR = os.path.join("examples", "images") +if not os.path.exists(IMAGE_DIR): + os.makedirs(IMAGE_DIR) + + +class SimpleRLEnv(habitat.RLEnv): + def get_reward_range(self): + return [-1, 1] + + def get_reward(self, observations): + return 0 + + def get_done(self, observations): + return self.habitat_env.episode_over + + def get_info(self, observations): + return self.habitat_env.get_metrics() + + +def draw_top_down_map(info, output_size): + return maps.colorize_draw_agent_and_fit_to_height( + info["top_down_map"], output_size + ) + + +def shortest_path_example(): + config = habitat.get_config(config_paths="configs/tasks/pointnav.yaml") + config.defrost() + config.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") + config.freeze() + with SimpleRLEnv(config=config) as env: + goal_radius = env.episodes[0].goals[0].radius + if goal_radius is None: + goal_radius = config.SIMULATOR.FORWARD_STEP_SIZE + follower = ShortestPathFollower( + env.habitat_env.sim, goal_radius, False + ) + + print("Environment creation successful") + for episode in range(3): + env.reset() + dirname = os.path.join( + IMAGE_DIR, "shortest_path_example", "%02d" % episode + ) + if os.path.exists(dirname): + shutil.rmtree(dirname) + os.makedirs(dirname) + print("Agent stepping around inside environment.") + images = [] + while not env.habitat_env.episode_over: + best_action = follower.get_next_action( + env.habitat_env.current_episode.goals[0].position + ) + if best_action is None: + break + + observations, reward, done, info = env.step(best_action) + im = observations["rgb"] + top_down_map = draw_top_down_map(info, im.shape[0]) + output_im = np.concatenate((im, top_down_map), axis=1) + images.append(output_im) + images_to_video(images, dirname, "trajectory") + print("Episode finished") + + +def main(): + shortest_path_example() + + +if 
__name__ == "__main__": + main() diff --git a/habitat-lab-dialog/examples/tutorials/colabs/Habitat_Interactive_Tasks.ipynb b/habitat-lab-dialog/examples/tutorials/colabs/Habitat_Interactive_Tasks.ipynb new file mode 100644 index 0000000..3a41959 --- /dev/null +++ b/habitat-lab-dialog/examples/tutorials/colabs/Habitat_Interactive_Tasks.ipynb @@ -0,0 +1,2212 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Furniture Rearrangement - How to setup a new interaction task in Habitat-Lab\n", + "\n", + "This tutorial demonstrates how to setup a new task in Habitat that utilizes interaction capabilities in Habitat Simulator.\n", + "\n", + "![teaser](https://drive.google.com/uc?id=1pupGvb4dGefd0T_23GpeDkkcIocDHSL_)\n", + "\n", + "## Task Definition:\n", + "The working example in this demo will be the task of **Furniture Rearrangement** - The agent will be randomly spawned in an environment in which the furniture are initially displaced from their desired position. The agent is tasked with navigating the environment, picking furniture and putting them in the desired position. To keep the tutorial simple and easy to follow, we will rearrange just a single object.\n", + "\n", + "To setup this task, we will build on top of existing API in Habitat-Simulator and Habitat-Lab. Here is a summary of all the steps involved in setting up this task:\n", + "\n", + "1. **Setup the Simulator**: Using existing functionalities of the Habitat-Sim, we can add or remove objects from the scene. We will use these methods to spawn the agent and the objects at some pre-defined initial configuration.\n", + "2. **Create a New Dataset**: We will define a new dataset class to save / load a list of episodes for the agent to train and evaluate on.\n", + "3. **Grab / Release Action**: We will add the \"grab/release\" action to the agent's action space to allow the agent to pickup / drop an object under a crosshair.\n", + "4. **Extend the Simulator Class**: We will extend the Simulator Class to add support for new actions implemented in previous step and add other additional utility functions\n", + "5. **Create a New Task**: Create a new task definition, implement new *sensors* and *metrics*.\n", + "6. **Train an RL agent**: We will define rewards for this task and utilize it to train an RL agent using the PPO algorithm.\n", + "\n", + "Let's get started!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title Installation { display-mode: \"form\" }\n", + "# @markdown (double click to show code).\n", + "\n", + "!curl -L https://raw.githubusercontent.com/facebookresearch/habitat-sim/master/examples/colab_utils/colab_install.sh | NIGHTLY=true bash -s\n", + "%cd /content\n", + "\n", + "!gdown --id 1Pc-J6pZzXEd8RSeLM94t3iwO8q_RQ853\n", + "!unzip -o /content/coda.zip -d /content/habitat-sim/data/scene_datasets\n", + "\n", + "# reload the cffi version\n", + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " import importlib\n", + "\n", + " import cffi\n", + "\n", + " importlib.reload(cffi)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# @title Path Setup and Imports { display-mode: \"form\" }\n", + "# @markdown (double click to show code).\n", + "\n", + "%cd /content/habitat-lab\n", + "\n", + "## [setup]\n", + "import gzip\n", + "import json\n", + "import os\n", + "import sys\n", + "from typing import Any, Dict, List, Optional, Type\n", + "\n", + "import attr\n", + "import cv2\n", + "import git\n", + "import magnum as mn\n", + "import numpy as np\n", + "\n", + "%matplotlib inline\n", + "from matplotlib import pyplot as plt\n", + "from PIL import Image\n", + "\n", + "import habitat\n", + "import habitat_sim\n", + "from habitat.config import Config\n", + "from habitat.core.registry import registry\n", + "from habitat_sim.utils import viz_utils as vut\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " os.environ[\"IMAGEIO_FFMPEG_EXE\"] = \"/usr/bin/ffmpeg\"\n", + "\n", + "repo = git.Repo(\".\", search_parent_directories=True)\n", + "dir_path = repo.working_tree_dir\n", + "%cd $dir_path\n", + "data_path = os.path.join(dir_path, \"data\")\n", + "output_directory = \"data/tutorials/output/\" # @param {type:\"string\"}\n", + "output_path = os.path.join(dir_path, output_directory)\n", + "\n", + "if __name__ == \"__main__\":\n", + " import argparse\n", + "\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\"--no-display\", dest=\"display\", action=\"store_false\")\n", + " parser.add_argument(\n", + " \"--no-make-video\", dest=\"make_video\", action=\"store_false\"\n", + " )\n", + " parser.set_defaults(show_video=True, make_video=True)\n", + " args, _ = parser.parse_known_args()\n", + " show_video = args.display\n", + " display = args.display\n", + " make_video = args.make_video\n", + "else:\n", + " show_video = False\n", + " make_video = False\n", + " display = False\n", + "\n", + "if make_video and not os.path.exists(output_path):\n", + " os.makedirs(output_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# @title Util functions to visualize observations\n", + "# @markdown - `make_video_cv2`: Renders a video from a list of observations\n", + "# @markdown - `simulate`: Runs simulation for a given amount of time at 60Hz\n", + "# @markdown - `simulate_and_make_vid` Runs simulation and creates video\n", + "\n", + "\n", + "def make_video_cv2(\n", + " observations, cross_hair=None, prefix=\"\", open_vid=True, fps=60\n", + "):\n", + " sensor_keys = list(observations[0])\n", + " videodims = observations[0][sensor_keys[0]].shape\n", + " videodims = (videodims[1], videodims[0]) # flip to w,h order\n", + " print(videodims)\n", + " video_file = 
output_path + prefix + \".mp4\"\n", + " print(\"Encoding the video: %s \" % video_file)\n", + " writer = vut.get_fast_video_writer(video_file, fps=fps)\n", + " for ob in observations:\n", + " # If in RGB/RGBA format, remove the alpha channel\n", + " rgb_im_1st_person = cv2.cvtColor(ob[\"rgb\"], cv2.COLOR_RGBA2RGB)\n", + " if cross_hair is not None:\n", + " rgb_im_1st_person[\n", + " cross_hair[0] - 2 : cross_hair[0] + 2,\n", + " cross_hair[1] - 2 : cross_hair[1] + 2,\n", + " ] = [255, 0, 0]\n", + "\n", + " if rgb_im_1st_person.shape[:2] != videodims:\n", + " rgb_im_1st_person = cv2.resize(\n", + " rgb_im_1st_person, videodims, interpolation=cv2.INTER_AREA\n", + " )\n", + " # write the 1st person observation to video\n", + " writer.append_data(rgb_im_1st_person)\n", + " writer.close()\n", + "\n", + " if open_vid:\n", + " print(\"Displaying video\")\n", + " vut.display_video(video_file)\n", + "\n", + "\n", + "def simulate(sim, dt=1.0, get_frames=True):\n", + " # simulate dt seconds at 60Hz to the nearest fixed timestep\n", + " print(\"Simulating \" + str(dt) + \" world seconds.\")\n", + " observations = []\n", + " start_time = sim.get_world_time()\n", + " while sim.get_world_time() < start_time + dt:\n", + " sim.step_physics(1.0 / 60.0)\n", + " if get_frames:\n", + " observations.append(sim.get_sensor_observations())\n", + " return observations\n", + "\n", + "\n", + "# convenience wrapper for simulate and make_video_cv2\n", + "def simulate_and_make_vid(sim, crosshair, prefix, dt=1.0, open_vid=True):\n", + " observations = simulate(sim, dt)\n", + " make_video_cv2(observations, crosshair, prefix=prefix, open_vid=open_vid)\n", + "\n", + "\n", + "def display_sample(\n", + " rgb_obs,\n", + " semantic_obs=np.array([]),\n", + " depth_obs=np.array([]),\n", + " key_points=None, # noqa: B006\n", + "):\n", + " from habitat_sim.utils.common import d3_40_colors_rgb\n", + "\n", + " rgb_img = Image.fromarray(rgb_obs, mode=\"RGB\")\n", + "\n", + " arr = [rgb_img]\n", + " titles = [\"rgb\"]\n", + " if semantic_obs.size != 0:\n", + " semantic_img = Image.new(\n", + " \"P\", (semantic_obs.shape[1], semantic_obs.shape[0])\n", + " )\n", + " semantic_img.putpalette(d3_40_colors_rgb.flatten())\n", + " semantic_img.putdata((semantic_obs.flatten() % 40).astype(np.uint8))\n", + " semantic_img = semantic_img.convert(\"RGBA\")\n", + " arr.append(semantic_img)\n", + " titles.append(\"semantic\")\n", + "\n", + " if depth_obs.size != 0:\n", + " depth_img = Image.fromarray(\n", + " (depth_obs / 10 * 255).astype(np.uint8), mode=\"L\"\n", + " )\n", + " arr.append(depth_img)\n", + " titles.append(\"depth\")\n", + "\n", + " plt.figure(figsize=(12, 8))\n", + " for i, data in enumerate(arr):\n", + " ax = plt.subplot(1, 3, i + 1)\n", + " ax.axis(\"off\")\n", + " ax.set_title(titles[i])\n", + " # plot points on images\n", + " if key_points is not None:\n", + " for point in key_points:\n", + " plt.plot(\n", + " point[0], point[1], marker=\"o\", markersize=10, alpha=0.8\n", + " )\n", + " plt.imshow(data)\n", + "\n", + " plt.show(block=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Setup the Simulator\n", + "\n", + "---\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# @title Setup simulator configuration\n", + "# @markdown We'll start with setting up simulator with the following configurations\n", + "# @markdown - The simulator will render both RGB, Depth observations of 256x256 resolution.\n", + "# @markdown - The actions available will be `move_forward`, `turn_left`, `turn_right`.\n", + "\n", + "\n", + "def make_cfg(settings):\n", + " sim_cfg = habitat_sim.SimulatorConfiguration()\n", + " sim_cfg.gpu_device_id = 0\n", + " sim_cfg.default_agent_id = settings[\"default_agent_id\"]\n", + " sim_cfg.scene_id = settings[\"scene\"]\n", + " sim_cfg.enable_physics = settings[\"enable_physics\"]\n", + " sim_cfg.physics_config_file = settings[\"physics_config_file\"]\n", + "\n", + " # Note: all sensors must have the same resolution\n", + " sensors = {\n", + " \"rgb\": {\n", + " \"sensor_type\": habitat_sim.SensorType.COLOR,\n", + " \"resolution\": [settings[\"height\"], settings[\"width\"]],\n", + " \"position\": [0.0, settings[\"sensor_height\"], 0.0],\n", + " },\n", + " \"depth\": {\n", + " \"sensor_type\": habitat_sim.SensorType.DEPTH,\n", + " \"resolution\": [settings[\"height\"], settings[\"width\"]],\n", + " \"position\": [0.0, settings[\"sensor_height\"], 0.0],\n", + " },\n", + " }\n", + "\n", + " sensor_specs = []\n", + " for sensor_uuid, sensor_params in sensors.items():\n", + " if settings[sensor_uuid]:\n", + " sensor_spec = habitat_sim.SensorSpec()\n", + " sensor_spec.uuid = sensor_uuid\n", + " sensor_spec.sensor_type = sensor_params[\"sensor_type\"]\n", + " sensor_spec.resolution = sensor_params[\"resolution\"]\n", + " sensor_spec.position = sensor_params[\"position\"]\n", + "\n", + " sensor_specs.append(sensor_spec)\n", + "\n", + " # Here you can specify the amount of displacement in a forward action and the turn angle\n", + " agent_cfg = habitat_sim.agent.AgentConfiguration()\n", + " agent_cfg.sensor_specifications = sensor_specs\n", + " agent_cfg.action_space = {\n", + " \"move_forward\": habitat_sim.agent.ActionSpec(\n", + " \"move_forward\", habitat_sim.agent.ActuationSpec(amount=0.1)\n", + " ),\n", + " \"turn_left\": habitat_sim.agent.ActionSpec(\n", + " \"turn_left\", habitat_sim.agent.ActuationSpec(amount=10.0)\n", + " ),\n", + " \"turn_right\": habitat_sim.agent.ActionSpec(\n", + " \"turn_right\", habitat_sim.agent.ActuationSpec(amount=10.0)\n", + " ),\n", + " }\n", + "\n", + " return habitat_sim.Configuration(sim_cfg, [agent_cfg])\n", + "\n", + "\n", + "settings = {\n", + " \"max_frames\": 10,\n", + " \"width\": 256,\n", + " \"height\": 256,\n", + " \"scene\": \"data/scene_datasets/coda/coda.glb\",\n", + " \"default_agent_id\": 0,\n", + " \"sensor_height\": 1.5, # Height of sensors in meters\n", + " \"rgb\": True, # RGB sensor\n", + " \"depth\": True, # Depth sensor\n", + " \"seed\": 1,\n", + " \"enable_physics\": True,\n", + " \"physics_config_file\": \"data/default.physics_config.json\",\n", + " \"silent\": False,\n", + " \"compute_shortest_path\": False,\n", + " \"compute_action_shortest_path\": False,\n", + " \"save_png\": True,\n", + "}\n", + "\n", + "cfg = make_cfg(settings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# @title Spawn the agent at a pre-defined location\n", + "\n", + "\n", + "def init_agent(sim):\n", + " agent_pos = 
{ "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [
+ "# @title Spawn the agent at a pre-defined location\n",
+ "\n",
+ "\n",
+ "def init_agent(sim):\n",
+ "    agent_pos = np.array([-0.15776923, 0.18244143, 0.2988735])\n",
+ "\n",
+ "    # Place the agent\n",
+ "    sim.agents[0].scene_node.translation = agent_pos\n",
+ "    agent_orientation_y = -40\n",
+ "    sim.agents[0].scene_node.rotation = mn.Quaternion.rotation(\n",
+ "        mn.Deg(agent_orientation_y), mn.Vector3(0, 1.0, 0)\n",
+ "    )\n",
+ "\n",
+ "\n",
+ "cfg.sim_cfg.default_agent_id = 0\n",
+ "with habitat_sim.Simulator(cfg) as sim:\n",
+ "    init_agent(sim)\n",
+ "    if make_video:\n",
+ "        # Visualize the agent's initial position\n",
+ "        simulate_and_make_vid(\n",
+ "            sim, None, \"sim-init\", dt=1.0, open_vid=show_video\n",
+ "        )"
] }, { "cell_type": "code", "execution_count": null, "metadata": { "lines_to_next_cell": 2 }, "outputs": [], "source": [
+ "# @title Set the object's initial and final position\n",
+ "# @markdown Defines two utility functions:\n",
+ "# @markdown - `remove_all_objects`: This will remove all objects from the scene\n",
+ "# @markdown - `set_object_in_front_of_agent`: This will add an object in the scene in front of the agent at the specified distance.\n",
+ "\n",
+ "# @markdown Here we add a chair *3.0m* away from the agent; the task is to place the object at the desired final position, which is *7.0m* in front of the agent.\n",
+ "\n",
+ "\n",
+ "def remove_all_objects(sim):\n",
+ "    for obj_id in sim.get_existing_object_ids():\n",
+ "        sim.remove_object(obj_id)\n",
+ "\n",
+ "\n",
+ "def set_object_in_front_of_agent(sim, obj_id, z_offset=-1.5):\n",
+ "    r\"\"\"\n",
+ "    Adds an object in front of the agent at some distance.\n",
+ "    \"\"\"\n",
+ "    agent_transform = sim.agents[0].scene_node.transformation_matrix()\n",
+ "    obj_translation = agent_transform.transform_point(\n",
+ "        np.array([0, 0, z_offset])\n",
+ "    )\n",
+ "    sim.set_translation(obj_translation, obj_id)\n",
+ "\n",
+ "    obj_node = sim.get_object_scene_node(obj_id)\n",
+ "    xform_bb = habitat_sim.geo.get_transformed_bb(\n",
+ "        obj_node.cumulative_bb, obj_node.transformation\n",
+ "    )\n",
+ "\n",
+ "    # also account for collision margin of the scene\n",
+ "    scene_collision_margin = 0.04\n",
+ "    y_translation = mn.Vector3(\n",
+ "        0, xform_bb.size_y() / 2.0 + scene_collision_margin, 0\n",
+ "    )\n",
+ "    sim.set_translation(y_translation + sim.get_translation(obj_id), obj_id)\n",
+ "\n",
+ "\n",
+ "def init_objects(sim):\n",
+ "    # Manager of Object Attributes Templates\n",
+ "    obj_attr_mgr = sim.get_object_template_manager()\n",
+ "    obj_attr_mgr.load_configs(\n",
+ "        str(os.path.join(data_path, \"test_assets/objects\"))\n",
+ "    )\n",
+ "\n",
+ "    # Add a chair into the scene.\n",
+ "    obj_path = \"test_assets/objects/chair\"\n",
+ "    chair_template_id = obj_attr_mgr.load_object_configs(\n",
+ "        str(os.path.join(data_path, obj_path))\n",
+ "    )[0]\n",
+ "    chair_attr = obj_attr_mgr.get_template_by_ID(chair_template_id)\n",
+ "    obj_attr_mgr.register_template(chair_attr)\n",
+ "\n",
+ "    # Object's initial position 3m away from the agent.\n",
+ "    object_id = sim.add_object_by_handle(chair_attr.handle)\n",
+ "    set_object_in_front_of_agent(sim, object_id, -3.0)\n",
+ "    sim.set_object_motion_type(\n",
+ "        habitat_sim.physics.MotionType.STATIC, object_id\n",
+ "    )\n",
+ "\n",
+ "    # Object's final position 7m away from the agent\n",
+ "    goal_id = sim.add_object_by_handle(chair_attr.handle)\n",
+ "    set_object_in_front_of_agent(sim, goal_id, -7.0)\n",
+ "    sim.set_object_motion_type(habitat_sim.physics.MotionType.STATIC, goal_id)\n",
+ "\n",
+ "    return object_id, goal_id\n",
+ "\n",
+ "\n",
+ "with habitat_sim.Simulator(cfg) as 
sim:\n", + " init_agent(sim)\n", + " init_objects(sim)\n", + "\n", + " # Visualize the scene after the chair is added into the scene.\n", + " if make_video:\n", + " simulate_and_make_vid(\n", + " sim, None, \"object-init\", dt=1.0, open_vid=show_video\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rearrangement Dataset\n", + "![dataset](https://drive.google.com/uc?id=1y0qS0MifmJsZ0F4jsRZGI9BrXzslFLn7)\n", + "\n", + "In the previous section, we created a single episode of the rearrangement task. Let's define a format to store all the necessary information about a single episode. It should store the *scene* the episode belongs to, *initial spawn position and orientation* of the agent, *object type*, object's *initial position and orientation* as well as *final position and orientation*.\n", + "\n", + "The format will be as follows:\n", + "```\n", + "{\n", + " 'episode_id': 0,\n", + " 'scene_id': 'data/scene_datasets/coda/coda.glb',\n", + " 'goals': {\n", + " 'position': [4.34, 0.67, -5.06],\n", + " 'rotation': [0.0, 0.0, 0.0, 1.0]\n", + " },\n", + " 'objects': {\n", + " 'object_id': 0,\n", + " 'object_template': 'data/test_assets/objects/chair',\n", + " 'position': [1.77, 0.67, -1.99],\n", + " 'rotation': [0.0, 0.0, 0.0, 1.0]\n", + " },\n", + " 'start_position': [-0.15, 0.18, 0.29],\n", + " 'start_rotation': [-0.0, -0.34, -0.0, 0.93]}\n", + "}\n", + "```\n", + "Once an episode is defined, a dataset will just be a collection of such episodes. For simplicity, in this notebook, the dataset will only contain one episode defined above.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# @title Create a new dataset\n", + "# @markdown Utility functions to define and save the dataset for the rearrangement task\n", + "\n", + "\n", + "def get_rotation(sim, object_id):\n", + " quat = sim.get_rotation(object_id)\n", + " return np.array(quat.vector).tolist() + [quat.scalar]\n", + "\n", + "\n", + "def init_episode_dict(episode_id, scene_id, agent_pos, agent_rot):\n", + " episode_dict = {\n", + " \"episode_id\": episode_id,\n", + " \"scene_id\": \"data/scene_datasets/coda/coda.glb\",\n", + " \"start_position\": agent_pos,\n", + " \"start_rotation\": agent_rot,\n", + " \"info\": {},\n", + " }\n", + " return episode_dict\n", + "\n", + "\n", + "def add_object_details(sim, episode_dict, obj_id, object_template, object_id):\n", + " object_template = {\n", + " \"object_id\": obj_id,\n", + " \"object_template\": object_template,\n", + " \"position\": np.array(sim.get_translation(object_id)).tolist(),\n", + " \"rotation\": get_rotation(sim, object_id),\n", + " }\n", + " episode_dict[\"objects\"] = object_template\n", + " return episode_dict\n", + "\n", + "\n", + "def add_goal_details(sim, episode_dict, object_id):\n", + " goal_template = {\n", + " \"position\": np.array(sim.get_translation(object_id)).tolist(),\n", + " \"rotation\": get_rotation(sim, object_id),\n", + " }\n", + " episode_dict[\"goals\"] = goal_template\n", + " return episode_dict\n", + "\n", + "\n", + "# set the number of objects to 1 always for now.\n", + "def build_episode(sim, episode_num, object_id, goal_id):\n", + " episodes = {\"episodes\": []}\n", + " for episode in range(episode_num):\n", + " agent_state = sim.get_agent(0).get_state()\n", + " agent_pos = np.array(agent_state.position).tolist()\n", + " agent_quat = agent_state.rotation\n", + " agent_rot = np.array(agent_quat.vec).tolist() + 
[agent_quat.real]\n", + " episode_dict = init_episode_dict(\n", + " episode, settings[\"scene\"], agent_pos, agent_rot\n", + " )\n", + "\n", + " object_attr = sim.get_object_initialization_template(object_id)\n", + " object_path = os.path.relpath(\n", + " os.path.splitext(object_attr.render_asset_handle)[0]\n", + " )\n", + "\n", + " episode_dict = add_object_details(\n", + " sim, episode_dict, 0, object_path, object_id\n", + " )\n", + " episode_dict = add_goal_details(sim, episode_dict, goal_id)\n", + " episodes[\"episodes\"].append(episode_dict)\n", + "\n", + " return episodes\n", + "\n", + "\n", + "with habitat_sim.Simulator(cfg) as sim:\n", + " init_agent(sim)\n", + " object_id, goal_id = init_objects(sim)\n", + "\n", + " episodes = build_episode(sim, 1, object_id, goal_id)\n", + "\n", + " dataset_content_path = \"data/datasets/rearrangement/coda/v1/train/\"\n", + " if not os.path.exists(dataset_content_path):\n", + " os.makedirs(dataset_content_path)\n", + "\n", + " with gzip.open(\n", + " os.path.join(dataset_content_path, \"train.json.gz\"), \"wt\"\n", + " ) as f:\n", + " json.dump(episodes, f)\n", + "\n", + " print(\n", + " \"Dataset written to {}\".format(\n", + " os.path.join(dataset_content_path, \"train.json.gz\")\n", + " )\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title Dataset class to read the saved dataset in Habitat-Lab.\n", + "# @markdown To read the saved episodes in Habitat-Lab, we will extend the `Dataset` class and the `Episode` base class. It will help provide all the relevant details about the episode through a consistent API to all downstream tasks.\n", + "\n", + "# @markdown - We will first create a `RearrangementEpisode` by extending the `NavigationEpisode` to include additional information about object's initial configuration and desired final configuration.\n", + "# @markdown - We will then define a `RearrangementDatasetV0` class that builds on top of `PointNavDatasetV1` class to read the JSON file stored earlier and initialize a list of `RearrangementEpisode`.\n", + "\n", + "from habitat.core.utils import DatasetFloatJSONEncoder, not_none_validator\n", + "from habitat.datasets.pointnav.pointnav_dataset import (\n", + " CONTENT_SCENES_PATH_FIELD,\n", + " DEFAULT_SCENE_PATH_PREFIX,\n", + " PointNavDatasetV1,\n", + ")\n", + "from habitat.tasks.nav.nav import NavigationEpisode\n", + "\n", + "\n", + "@attr.s(auto_attribs=True, kw_only=True)\n", + "class RearrangementSpec:\n", + " r\"\"\"Specifications that capture a particular position of final position\n", + " or initial position of the object.\n", + " \"\"\"\n", + "\n", + " position: List[float] = attr.ib(default=None, validator=not_none_validator)\n", + " rotation: List[float] = attr.ib(default=None, validator=not_none_validator)\n", + " info: Optional[Dict[str, str]] = attr.ib(default=None)\n", + "\n", + "\n", + "@attr.s(auto_attribs=True, kw_only=True)\n", + "class RearrangementObjectSpec(RearrangementSpec):\n", + " r\"\"\"Object specifications that capture position of each object in the scene,\n", + " the associated object template.\n", + " \"\"\"\n", + " object_id: str = attr.ib(default=None, validator=not_none_validator)\n", + " object_template: Optional[str] = attr.ib(\n", + " default=\"data/test_assets/objects/chair\"\n", + " )\n", + "\n", + "\n", + "@attr.s(auto_attribs=True, kw_only=True)\n", + "class RearrangementEpisode(NavigationEpisode):\n", + " r\"\"\"Specification of episode that includes initial position and 
rotation\n", + " of agent, all goal specifications, all object specifications\n", + "\n", + " Args:\n", + " episode_id: id of episode in the dataset\n", + " scene_id: id of scene inside the simulator.\n", + " start_position: numpy ndarray containing 3 entries for (x, y, z).\n", + " start_rotation: numpy ndarray with 4 entries for (x, y, z, w)\n", + " elements of unit quaternion (versor) representing agent 3D\n", + " orientation.\n", + " goal: object's goal position and rotation\n", + " object: object's start specification defined with object type,\n", + " position, and rotation.\n", + " \"\"\"\n", + " objects: RearrangementObjectSpec = attr.ib(\n", + " default=None, validator=not_none_validator\n", + " )\n", + " goals: RearrangementSpec = attr.ib(\n", + " default=None, validator=not_none_validator\n", + " )\n", + "\n", + "\n", + "@registry.register_dataset(name=\"RearrangementDataset-v0\")\n", + "class RearrangementDatasetV0(PointNavDatasetV1):\n", + " r\"\"\"Class inherited from PointNavDataset that loads Rearrangement dataset.\"\"\"\n", + " episodes: List[RearrangementEpisode]\n", + " content_scenes_path: str = \"{data_path}/content/{scene}.json.gz\"\n", + "\n", + " def to_json(self) -> str:\n", + " result = DatasetFloatJSONEncoder().encode(self)\n", + " return result\n", + "\n", + " def __init__(self, config: Optional[Config] = None) -> None:\n", + " super().__init__(config)\n", + "\n", + " def from_json(\n", + " self, json_str: str, scenes_dir: Optional[str] = None\n", + " ) -> None:\n", + " deserialized = json.loads(json_str)\n", + " if CONTENT_SCENES_PATH_FIELD in deserialized:\n", + " self.content_scenes_path = deserialized[CONTENT_SCENES_PATH_FIELD]\n", + "\n", + " for i, episode in enumerate(deserialized[\"episodes\"]):\n", + " rearrangement_episode = RearrangementEpisode(**episode)\n", + " rearrangement_episode.episode_id = str(i)\n", + "\n", + " if scenes_dir is not None:\n", + " if rearrangement_episode.scene_id.startswith(\n", + " DEFAULT_SCENE_PATH_PREFIX\n", + " ):\n", + " rearrangement_episode.scene_id = (\n", + " rearrangement_episode.scene_id[\n", + " len(DEFAULT_SCENE_PATH_PREFIX) :\n", + " ]\n", + " )\n", + "\n", + " rearrangement_episode.scene_id = os.path.join(\n", + " scenes_dir, rearrangement_episode.scene_id\n", + " )\n", + "\n", + " rearrangement_episode.objects = RearrangementObjectSpec(\n", + " **rearrangement_episode.objects\n", + " )\n", + " rearrangement_episode.goals = RearrangementSpec(\n", + " **rearrangement_episode.goals\n", + " )\n", + "\n", + " self.episodes.append(rearrangement_episode)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# @title Load the saved dataset using the Dataset class\n", + "config = habitat.get_config(\"configs/datasets/pointnav/habitat_test.yaml\")\n", + "config.defrost()\n", + "config.DATASET.DATA_PATH = (\n", + " \"data/datasets/rearrangement/coda/v1/{split}/{split}.json.gz\"\n", + ")\n", + "config.DATASET.TYPE = \"RearrangementDataset-v0\"\n", + "config.freeze()\n", + "\n", + "dataset = RearrangementDatasetV0(config.DATASET)\n", + "\n", + "# check if the dataset got correctly deserialized\n", + "assert len(dataset.episodes) == 1\n", + "\n", + "assert dataset.episodes[0].objects.position == [\n", + " 1.770593523979187,\n", + " 0.6726829409599304,\n", + " -1.9992598295211792,\n", + "]\n", + "assert dataset.episodes[0].objects.rotation == [0.0, 0.0, 0.0, 1.0]\n", + "assert (\n", + " dataset.episodes[0].objects.object_template\n", + " 
== \"data/test_assets/objects/chair\"\n", + ")\n", + "\n", + "assert dataset.episodes[0].goals.position == [\n", + " 4.3417439460754395,\n", + " 0.6726829409599304,\n", + " -5.0634379386901855,\n", + "]\n", + "assert dataset.episodes[0].goals.rotation == [0.0, 0.0, 0.0, 1.0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Implement Grab/Release Action" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title RayCast utility to implement Grab/Release Under Cross-Hair Action\n", + "# @markdown Cast a ray in the direction of crosshair from the camera and check if it collides with another object within a certain distance threshold\n", + "\n", + "\n", + "def raycast(sim, sensor_name, crosshair_pos=(128, 128), max_distance=2.0):\n", + " r\"\"\"Cast a ray in the direction of crosshair and check if it collides\n", + " with another object within a certain distance threshold\n", + " :param sim: Simulator object\n", + " :param sensor_name: name of the visual sensor to be used for raycasting\n", + " :param crosshair_pos: 2D coordiante in the viewport towards which the\n", + " ray will be cast\n", + " :param max_distance: distance threshold beyond which objects won't\n", + " be considered\n", + " \"\"\"\n", + " render_camera = sim._sensors[sensor_name]._sensor_object.render_camera\n", + " center_ray = render_camera.unproject(mn.Vector2i(crosshair_pos))\n", + "\n", + " raycast_results = sim.cast_ray(center_ray, max_distance=max_distance)\n", + "\n", + " closest_object = -1\n", + " closest_dist = 1000.0\n", + " if raycast_results.has_hits():\n", + " for hit in raycast_results.hits:\n", + " if hit.ray_distance < closest_dist:\n", + " closest_dist = hit.ray_distance\n", + " closest_object = hit.object_id\n", + "\n", + " return closest_object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# Test the raycast utility.\n", + "\n", + "with habitat_sim.Simulator(cfg) as sim:\n", + " init_agent(sim)\n", + " obj_attr_mgr = sim.get_object_template_manager()\n", + " obj_attr_mgr.load_configs(\n", + " str(os.path.join(data_path, \"test_assets/objects\"))\n", + " )\n", + " obj_path = \"test_assets/objects/chair\"\n", + " chair_template_id = obj_attr_mgr.load_object_configs(\n", + " str(os.path.join(data_path, obj_path))\n", + " )[0]\n", + " chair_attr = obj_attr_mgr.get_template_by_ID(chair_template_id)\n", + " obj_attr_mgr.register_template(chair_attr)\n", + " object_id = sim.add_object_by_handle(chair_attr.handle)\n", + " print(f\"Chair's object id is {object_id}\")\n", + "\n", + " set_object_in_front_of_agent(sim, object_id, -1.5)\n", + " sim.set_object_motion_type(\n", + " habitat_sim.physics.MotionType.STATIC, object_id\n", + " )\n", + " if make_video:\n", + " # Visualize the agent's initial position\n", + " simulate_and_make_vid(\n", + " sim, [190, 128], \"sim-before-grab\", dt=1.0, open_vid=show_video\n", + " )\n", + "\n", + " # Distance threshold=2 is greater than agent-to-chair distance.\n", + " # Should return chair's object id\n", + " closest_object = raycast(\n", + " sim, \"rgb\", crosshair_pos=[128, 190], max_distance=2.0\n", + " )\n", + " print(f\"Closest Object ID: {closest_object} using 2.0 threshold\")\n", + " assert (\n", + " closest_object == object_id\n", + " ), f\"Could not pick chair with ID: {object_id}\"\n", + "\n", + " # Distance threshold=1 is smaller than agent-to-chair distance .\n", + " # Should 
+ "    closest_object = raycast(\n",
+ "        sim, \"rgb\", crosshair_pos=[128, 190], max_distance=1.0\n",
+ "    )\n",
+ "    print(f\"Closest Object ID: {closest_object} using 1.0 threshold\")\n",
+ "    assert closest_object == -1, \"Agent should not be able to pick any object\""
] },
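{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
+ "# @title (Added) A note on the two crosshair conventions\n",
+ "# @markdown A short check added for clarity, not part of the original tutorial: the\n",
+ "# @markdown test above draws the crosshair with `make_video_cv2` using (row, col) =\n",
+ "# @markdown [190, 128] image indexing, while `raycast` unprojects a viewport\n",
+ "# @markdown (x, y) = [128, 190] point. Both refer to the same pixel; only the order\n",
+ "# @markdown of the coordinates is swapped.\n",
+ "\n",
+ "video_crosshair = [190, 128]  # (row, col) indexing used to draw on the frame\n",
+ "raycast_crosshair = [128, 190]  # (x, y) viewport coordinates used to unproject\n",
+ "assert video_crosshair[::-1] == raycast_crosshair\n"
] },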
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
+ "# @title Define a Grab/Release action and create a new action space.\n",
+ "# @markdown Each new action is defined by an `ActionSpec` and an `ActuationSpec`. `ActionSpec` is a mapping between the action name and its corresponding `ActuationSpec`. `ActuationSpec` contains all the necessary specifications required to define the action.\n",
+ "\n",
+ "from habitat.config.default import _C, CN\n",
+ "from habitat.core.embodied_task import SimulatorTaskAction\n",
+ "from habitat.sims.habitat_simulator.actions import (\n",
+ "    HabitatSimActions,\n",
+ "    HabitatSimV1ActionSpaceConfiguration,\n",
+ ")\n",
+ "from habitat_sim.agent.controls.controls import ActuationSpec\n",
+ "from habitat_sim.physics import MotionType\n",
+ "\n",
+ "\n",
+ "# @markdown For instance, `GrabReleaseActuationSpec` contains the following:\n",
+ "# @markdown - `visual_sensor_name` defines which viewport (rgb, depth, etc.) to use to cast the ray.\n",
+ "# @markdown - `crosshair_pos` stores the position in the viewport through which the ray passes. Any object which intersects with this ray can be grabbed by the agent.\n",
+ "# @markdown - `amount` defines a distance threshold. Objects which are farther than the threshold cannot be picked up by the agent.\n",
+ "@attr.s(auto_attribs=True, slots=True)\n",
+ "class GrabReleaseActuationSpec(ActuationSpec):\n",
+ "    visual_sensor_name: str = \"rgb\"\n",
+ "    crosshair_pos: List[int] = [128, 128]\n",
+ "    amount: float = 2.0\n",
+ "\n",
+ "\n",
+ "# @markdown Then, we extend the `HabitatSimV1ActionSpaceConfiguration` to add the above action into the agent's action space. `ActionSpaceConfiguration` is a mapping between the action name and the corresponding `ActionSpec`\n",
+ "@registry.register_action_space_configuration(name=\"RearrangementActions-v0\")\n",
+ "class RearrangementSimV0ActionSpaceConfiguration(\n",
+ "    HabitatSimV1ActionSpaceConfiguration\n",
+ "):\n",
+ "    def __init__(self, config):\n",
+ "        super().__init__(config)\n",
+ "        if not HabitatSimActions.has_action(\"GRAB_RELEASE\"):\n",
+ "            HabitatSimActions.extend_action_space(\"GRAB_RELEASE\")\n",
+ "\n",
+ "    def get(self):\n",
+ "        config = super().get()\n",
+ "        new_config = {\n",
+ "            HabitatSimActions.GRAB_RELEASE: habitat_sim.ActionSpec(\n",
+ "                \"grab_or_release_object_under_crosshair\",\n",
+ "                GrabReleaseActuationSpec(\n",
+ "                    visual_sensor_name=self.config.VISUAL_SENSOR,\n",
+ "                    crosshair_pos=self.config.CROSSHAIR_POS,\n",
+ "                    amount=self.config.GRAB_DISTANCE,\n",
+ "                ),\n",
+ "            )\n",
+ "        }\n",
+ "\n",
+ "        config.update(new_config)\n",
+ "\n",
+ "        return config\n",
+ "\n",
+ "\n",
+ "# @markdown Finally, we extend `SimulatorTaskAction`, which tells the simulator which action to call when a named action ('GRAB_RELEASE' in this case) is predicted by the agent's policy.\n",
+ "@registry.register_task_action\n",
+ "class GrabOrReleaseAction(SimulatorTaskAction):\n",
+ "    def step(self, *args: Any, **kwargs: Any):\n",
+ "        r\"\"\"This method is called from ``Env`` on each ``step``.\"\"\"\n",
+ "        return self._sim.step(HabitatSimActions.GRAB_RELEASE)\n",
+ "\n",
+ "\n",
+ "_C.TASK.ACTIONS.GRAB_RELEASE = CN()\n",
+ "_C.TASK.ACTIONS.GRAB_RELEASE.TYPE = \"GrabOrReleaseAction\"\n",
+ "_C.SIMULATOR.CROSSHAIR_POS = [128, 160]\n",
+ "_C.SIMULATOR.GRAB_DISTANCE = 2.0\n",
+ "_C.SIMULATOR.VISUAL_SENSOR = \"rgb\""
] }, { "cell_type": "markdown", "metadata": {}, "source": [
+ "## Setup Simulator Class for Rearrangement Task\n",
+ "\n",
+ "![sim](https://drive.google.com/uc?id=1ce6Ti-gpumMEyfomqAKWqOspXm6tN4_8)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
+ "# @title RearrangementSim Class\n",
+ "# @markdown Here we will extend the `HabitatSim` class for the rearrangement task. We will make the following changes:\n",
+ "# @markdown - define a new `_initialize_objects` function which will load the object in its initial configuration as defined by the episode.\n",
+ "# @markdown - define a `gripped_object_id` property that stores whether the agent is holding any object or not.\n",
+ "# @markdown - modify the `step` function of the simulator to use the `grab/release` action we defined earlier.\n",
+ "\n",
+ "# @markdown #### Writing the `step` function:\n",
+ "# @markdown Since we added a new action for this task, we have to modify the `step` function to define what happens when the `grab/release` action is called. If a simple navigation action (`move_forward`, `turn_left`, `turn_right`) is called, we pass it forward to the `act` function of the agent, which already defines the behavior of these actions.\n",
+ "\n",
+ "# @markdown For the `grab/release` action, if the agent is not already holding an object, we first call the `raycast` function using the values from the `ActuationSpec` to see if any object is grippable. If it returns a valid object id, we put the object in an \"invisible\" inventory and remove it from the scene.\n",
+ "\n",
+ "# @markdown If the agent was already holding an object, the `grab/release` action will try to release the object at the same relative position as it was grabbed. 
If the object can be placed without any collision, then the `release` action is successful.\n", + "\n", + "from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim\n", + "from habitat_sim.nav import NavMeshSettings\n", + "from habitat_sim.utils.common import quat_from_coeffs, quat_to_magnum\n", + "\n", + "\n", + "@registry.register_simulator(name=\"RearrangementSim-v0\")\n", + "class RearrangementSim(HabitatSim):\n", + " r\"\"\"Simulator wrapper over habitat-sim with\n", + " object rearrangement functionalities.\n", + " \"\"\"\n", + "\n", + " def __init__(self, config: Config) -> None:\n", + " self.did_reset = False\n", + " super().__init__(config=config)\n", + " self.grip_offset = np.eye(4)\n", + "\n", + " agent_id = self.habitat_config.DEFAULT_AGENT_ID\n", + " agent_config = self._get_agent_config(agent_id)\n", + "\n", + " self.navmesh_settings = NavMeshSettings()\n", + " self.navmesh_settings.set_defaults()\n", + " self.navmesh_settings.agent_radius = agent_config.RADIUS\n", + " self.navmesh_settings.agent_height = agent_config.HEIGHT\n", + "\n", + " def reconfigure(self, config: Config) -> None:\n", + " super().reconfigure(config)\n", + " self._initialize_objects()\n", + "\n", + " def reset(self):\n", + " sim_obs = super().reset()\n", + " if self._update_agents_state():\n", + " sim_obs = self.get_sensor_observations()\n", + "\n", + " self._prev_sim_obs = sim_obs\n", + " self.did_reset = True\n", + " self.grip_offset = np.eye(4)\n", + " return self._sensor_suite.get_observations(sim_obs)\n", + "\n", + " def _initialize_objects(self):\n", + " objects = self.habitat_config.objects[0]\n", + " obj_attr_mgr = self.get_object_template_manager()\n", + " obj_attr_mgr.load_configs(\n", + " str(os.path.join(data_path, \"test_assets/objects\"))\n", + " )\n", + " # first remove all existing objects\n", + " existing_object_ids = self.get_existing_object_ids()\n", + "\n", + " if len(existing_object_ids) > 0:\n", + " for obj_id in existing_object_ids:\n", + " self.remove_object(obj_id)\n", + "\n", + " self.sim_object_to_objid_mapping = {}\n", + " self.objid_to_sim_object_mapping = {}\n", + "\n", + " if objects is not None:\n", + " object_template = objects[\"object_template\"]\n", + " object_pos = objects[\"position\"]\n", + " object_rot = objects[\"rotation\"]\n", + "\n", + " object_template_id = obj_attr_mgr.load_object_configs(\n", + " object_template\n", + " )[0]\n", + " object_attr = obj_attr_mgr.get_template_by_ID(object_template_id)\n", + " obj_attr_mgr.register_template(object_attr)\n", + "\n", + " object_id = self.add_object_by_handle(object_attr.handle)\n", + " self.sim_object_to_objid_mapping[object_id] = objects[\"object_id\"]\n", + " self.objid_to_sim_object_mapping[objects[\"object_id\"]] = object_id\n", + "\n", + " self.set_translation(object_pos, object_id)\n", + " if isinstance(object_rot, list):\n", + " object_rot = quat_from_coeffs(object_rot)\n", + "\n", + " object_rot = quat_to_magnum(object_rot)\n", + " self.set_rotation(object_rot, object_id)\n", + "\n", + " self.set_object_motion_type(MotionType.STATIC, object_id)\n", + "\n", + " # Recompute the navmesh after placing all the objects.\n", + " self.recompute_navmesh(self.pathfinder, self.navmesh_settings, True)\n", + "\n", + " def _sync_gripped_object(self, gripped_object_id):\n", + " r\"\"\"\n", + " Sync the gripped object with the object associated with the agent.\n", + " \"\"\"\n", + " if gripped_object_id != -1:\n", + " agent_body_transformation = (\n", + " self._default_agent.scene_node.transformation\n", + " 
)\n",
+ "            self.set_transformation(\n",
+ "                agent_body_transformation, gripped_object_id\n",
+ "            )\n",
+ "            translation = agent_body_transformation.transform_point(\n",
+ "                np.array([0, 2.0, 0])\n",
+ "            )\n",
+ "            self.set_translation(translation, gripped_object_id)\n",
+ "\n",
+ "    @property\n",
+ "    def gripped_object_id(self):\n",
+ "        return self._prev_sim_obs.get(\"gripped_object_id\", -1)\n",
+ "\n",
+ "    def step(self, action: int):\n",
+ "        dt = 1 / 60.0\n",
+ "        self._num_total_frames += 1\n",
+ "        collided = False\n",
+ "        gripped_object_id = self.gripped_object_id\n",
+ "\n",
+ "        agent_config = self._default_agent.agent_config\n",
+ "        action_spec = agent_config.action_space[action]\n",
+ "\n",
+ "        if action_spec.name == \"grab_or_release_object_under_crosshair\":\n",
+ "            # if already holding an object, then try to release it\n",
+ "            if gripped_object_id != -1:\n",
+ "                agent_body_transformation = (\n",
+ "                    self._default_agent.scene_node.transformation\n",
+ "                )\n",
+ "                T = np.dot(agent_body_transformation, self.grip_offset)\n",
+ "\n",
+ "                self.set_transformation(T, gripped_object_id)\n",
+ "\n",
+ "                position = self.get_translation(gripped_object_id)\n",
+ "\n",
+ "                if self.pathfinder.is_navigable(position):\n",
+ "                    self.set_object_motion_type(\n",
+ "                        MotionType.STATIC, gripped_object_id\n",
+ "                    )\n",
+ "                    gripped_object_id = -1\n",
+ "                    self.recompute_navmesh(\n",
+ "                        self.pathfinder, self.navmesh_settings, True\n",
+ "                    )\n",
+ "            # if not holding an object, then try to grab\n",
+ "            else:\n",
+ "                gripped_object_id = raycast(\n",
+ "                    self,\n",
+ "                    action_spec.actuation.visual_sensor_name,\n",
+ "                    crosshair_pos=action_spec.actuation.crosshair_pos,\n",
+ "                    max_distance=action_spec.actuation.amount,\n",
+ "                )\n",
+ "\n",
+ "                # found a grabbable object.\n",
+ "                if gripped_object_id != -1:\n",
+ "                    agent_body_transformation = (\n",
+ "                        self._default_agent.scene_node.transformation\n",
+ "                    )\n",
+ "\n",
+ "                    self.grip_offset = np.dot(\n",
+ "                        np.array(agent_body_transformation.inverted()),\n",
+ "                        np.array(self.get_transformation(gripped_object_id)),\n",
+ "                    )\n",
+ "                    self.set_object_motion_type(\n",
+ "                        MotionType.KINEMATIC, gripped_object_id\n",
+ "                    )\n",
+ "                    self.recompute_navmesh(\n",
+ "                        self.pathfinder, self.navmesh_settings, True\n",
+ "                    )\n",
+ "\n",
+ "        else:\n",
+ "            collided = self._default_agent.act(action)\n",
+ "            self._last_state = self._default_agent.get_state()\n",
+ "\n",
+ "        # step physics by dt\n",
+ "        super().step_world(dt)\n",
+ "\n",
+ "        # Sync the gripped object after the agent moves.\n",
+ "        self._sync_gripped_object(gripped_object_id)\n",
+ "\n",
+ "        # obtain observations\n",
+ "        self._prev_sim_obs = self.get_sensor_observations()\n",
+ "        self._prev_sim_obs[\"collided\"] = collided\n",
+ "        self._prev_sim_obs[\"gripped_object_id\"] = gripped_object_id\n",
+ "\n",
+ "        observations = self._sensor_suite.get_observations(self._prev_sim_obs)\n",
+ "        return observations"
] },
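{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
+ "# @title (Added) Illustrating the grip-offset math\n",
+ "# @markdown A minimal, self-contained sketch (added for illustration, not part of the\n",
+ "# @markdown original tutorial) of the transform bookkeeping used in\n",
+ "# @markdown `RearrangementSim.step`: at grab time the object pose is stored relative to\n",
+ "# @markdown the agent (`grip_offset = inv(T_agent) @ T_object`); at release time it is\n",
+ "# @markdown re-applied under the agent's new pose (`T_object' = T_agent' @ grip_offset`).\n",
+ "# @markdown All poses below are made-up 4x4 homogeneous transforms in plain numpy.\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "\n",
+ "def yaw_pose(yaw_deg, translation):\n",
+ "    # Rotation about +Y plus a translation, as a 4x4 homogeneous transform.\n",
+ "    t = np.radians(yaw_deg)\n",
+ "    T = np.eye(4)\n",
+ "    T[:3, :3] = [\n",
+ "        [np.cos(t), 0.0, np.sin(t)],\n",
+ "        [0.0, 1.0, 0.0],\n",
+ "        [-np.sin(t), 0.0, np.cos(t)],\n",
+ "    ]\n",
+ "    T[:3, 3] = translation\n",
+ "    return T\n",
+ "\n",
+ "\n",
+ "T_agent = yaw_pose(30.0, [1.0, 0.0, 2.0])  # agent pose at grab time\n",
+ "T_object = yaw_pose(0.0, [1.0, 0.5, 0.5])  # object pose at grab time\n",
+ "grip_offset = np.linalg.inv(T_agent) @ T_object  # object pose in the agent frame\n",
+ "\n",
+ "T_agent_new = yaw_pose(-45.0, [3.0, 0.0, -1.0])  # agent pose at release time\n",
+ "T_object_new = T_agent_new @ grip_offset  # released object pose\n",
+ "\n",
+ "# The object's pose relative to the agent is preserved across the move.\n",
+ "assert np.allclose(np.linalg.inv(T_agent_new) @ T_object_new, grip_offset)\n"
] },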
{ "cell_type": "markdown", "metadata": {}, "source": [
+ "## Create the Rearrangement Task\n",
+ "![task](https://drive.google.com/uc?id=1N75Mmi6aigh33uL765ljsAqLzFmcs7Zn)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
+ "# @title Implement new sensors and measurements\n",
+ "# @markdown After defining the dataset, action space and simulator functions for the rearrangement task, we are one step closer to training agents to solve this task.\n",
+ "\n",
+ "# @markdown Here we define inputs to the policy and other measurements required to design reward functions.\n",
+ "\n",
+ "# @markdown **Sensors**: These define various parts of the simulator state that are visible to the agent. For simplicity, we'll assume that the agent knows the object's current position and the object's final goal position, both relative to the agent's current position.\n",
+ "# @markdown - The object's current position will be given by the `ObjectPosition` sensor.\n",
+ "# @markdown - The object's goal position will be available through the `ObjectGoal` sensor.\n",
+ "# @markdown - Finally, we will also use the `GrippedObject` sensor to tell the agent if it's holding any object or not.\n",
+ "\n",
+ "# @markdown **Measures**: These define various metrics about the task which can be used to measure task progress and define rewards. Note that measurements are *privileged* information not accessible to the agent as part of the observation space. We will need the following measurements:\n",
+ "# @markdown - `AgentToObjectDistance`, which measures the Euclidean distance between the agent and the object.\n",
+ "# @markdown - `ObjectToGoalDistance`, which measures the Euclidean distance between the object and the goal.\n",
+ "\n",
+ "from gym import spaces\n",
+ "\n",
+ "import habitat_sim\n",
+ "from habitat.config.default import CN, Config\n",
+ "from habitat.core.dataset import Episode\n",
+ "from habitat.core.embodied_task import Measure\n",
+ "from habitat.core.simulator import Observations, Sensor, SensorTypes, Simulator\n",
+ "from habitat.tasks.nav.nav import PointGoalSensor\n",
+ "\n",
+ "\n",
+ "@registry.register_sensor\n",
+ "class GrippedObjectSensor(Sensor):\n",
+ "    cls_uuid = \"gripped_object_id\"\n",
+ "\n",
+ "    def __init__(\n",
+ "        self, *args: Any, sim: RearrangementSim, config: Config, **kwargs: Any\n",
+ "    ):\n",
+ "        self._sim = sim\n",
+ "        super().__init__(config=config)\n",
+ "\n",
+ "    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:\n",
+ "        return self.cls_uuid\n",
+ "\n",
+ "    def _get_observation_space(self, *args: Any, **kwargs: Any):\n",
+ "\n",
+ "        return spaces.Discrete(len(self._sim.get_existing_object_ids()))\n",
+ "\n",
+ "    def _get_sensor_type(self, *args: Any, **kwargs: Any):\n",
+ "        return SensorTypes.MEASUREMENT\n",
+ "\n",
+ "    def get_observation(\n",
+ "        self,\n",
+ "        observations: Dict[str, Observations],\n",
+ "        episode: Episode,\n",
+ "        *args: Any,\n",
+ "        **kwargs: Any,\n",
+ "    ):\n",
+ "        obj_id = self._sim.sim_object_to_objid_mapping.get(\n",
+ "            self._sim.gripped_object_id, -1\n",
+ "        )\n",
+ "        return obj_id\n",
+ "\n",
+ "\n",
+ "@registry.register_sensor\n",
+ "class ObjectPosition(PointGoalSensor):\n",
+ "    cls_uuid: str = \"object_position\"\n",
+ "\n",
+ "    def _get_observation_space(self, *args: Any, **kwargs: Any):\n",
+ "        sensor_shape = (self._dimensionality,)\n",
+ "\n",
+ "        return spaces.Box(\n",
+ "            low=np.finfo(np.float32).min,\n",
+ "            high=np.finfo(np.float32).max,\n",
+ "            shape=sensor_shape,\n",
+ "            dtype=np.float32,\n",
+ "        )\n",
+ "\n",
+ "    def get_observation(\n",
+ "        self, *args: Any, observations, episode, **kwargs: Any\n",
+ "    ):\n",
+ "        agent_state = self._sim.get_agent_state()\n",
+ "        agent_position = agent_state.position\n",
+ "        rotation_world_agent = agent_state.rotation\n",
+ "\n",
+ "        object_id = self._sim.get_existing_object_ids()[0]\n",
+ "        object_position = self._sim.get_translation(object_id)\n",
+ "        pointgoal = self._compute_pointgoal(\n",
+ "            agent_position, rotation_world_agent, object_position\n",
+ "        )\n",
+ "        return pointgoal\n",
+ "\n",
+ "\n",
+ "@registry.register_sensor\n",
+ "class 
ObjectGoal(PointGoalSensor):\n", + " cls_uuid: str = \"object_goal\"\n", + "\n", + " def _get_observation_space(self, *args: Any, **kwargs: Any):\n", + " sensor_shape = (self._dimensionality,)\n", + "\n", + " return spaces.Box(\n", + " low=np.finfo(np.float32).min,\n", + " high=np.finfo(np.float32).max,\n", + " shape=sensor_shape,\n", + " dtype=np.float32,\n", + " )\n", + "\n", + " def get_observation(\n", + " self, *args: Any, observations, episode, **kwargs: Any\n", + " ):\n", + " agent_state = self._sim.get_agent_state()\n", + " agent_position = agent_state.position\n", + " rotation_world_agent = agent_state.rotation\n", + "\n", + " goal_position = np.array(episode.goals.position, dtype=np.float32)\n", + "\n", + " point_goal = self._compute_pointgoal(\n", + " agent_position, rotation_world_agent, goal_position\n", + " )\n", + " return point_goal\n", + "\n", + "\n", + "@registry.register_measure\n", + "class ObjectToGoalDistance(Measure):\n", + " \"\"\"The measure calculates distance of object towards the goal.\"\"\"\n", + "\n", + " cls_uuid: str = \"object_to_goal_distance\"\n", + "\n", + " def __init__(\n", + " self, sim: Simulator, config: Config, *args: Any, **kwargs: Any\n", + " ):\n", + " self._sim = sim\n", + " self._config = config\n", + "\n", + " super().__init__(**kwargs)\n", + "\n", + " @staticmethod\n", + " def _get_uuid(*args: Any, **kwargs: Any):\n", + " return ObjectToGoalDistance.cls_uuid\n", + "\n", + " def reset_metric(self, episode, *args: Any, **kwargs: Any):\n", + " self.update_metric(*args, episode=episode, **kwargs)\n", + "\n", + " def _geo_dist(self, src_pos, goal_pos: np.array) -> float:\n", + " return self._sim.geodesic_distance(src_pos, [goal_pos])\n", + "\n", + " def _euclidean_distance(self, position_a, position_b):\n", + " return np.linalg.norm(\n", + " np.array(position_b) - np.array(position_a), ord=2\n", + " )\n", + "\n", + " def update_metric(self, episode, *args: Any, **kwargs: Any):\n", + " sim_obj_id = self._sim.get_existing_object_ids()[0]\n", + "\n", + " previous_position = np.array(\n", + " self._sim.get_translation(sim_obj_id)\n", + " ).tolist()\n", + " goal_position = episode.goals.position\n", + " self._metric = self._euclidean_distance(\n", + " previous_position, goal_position\n", + " )\n", + "\n", + "\n", + "@registry.register_measure\n", + "class AgentToObjectDistance(Measure):\n", + " \"\"\"The measure calculates the distance of objects from the agent\"\"\"\n", + "\n", + " cls_uuid: str = \"agent_to_object_distance\"\n", + "\n", + " def __init__(\n", + " self, sim: Simulator, config: Config, *args: Any, **kwargs: Any\n", + " ):\n", + " self._sim = sim\n", + " self._config = config\n", + "\n", + " super().__init__(**kwargs)\n", + "\n", + " @staticmethod\n", + " def _get_uuid(*args: Any, **kwargs: Any):\n", + " return AgentToObjectDistance.cls_uuid\n", + "\n", + " def reset_metric(self, episode, *args: Any, **kwargs: Any):\n", + " self.update_metric(*args, episode=episode, **kwargs)\n", + "\n", + " def _euclidean_distance(self, position_a, position_b):\n", + " return np.linalg.norm(\n", + " np.array(position_b) - np.array(position_a), ord=2\n", + " )\n", + "\n", + " def update_metric(self, episode, *args: Any, **kwargs: Any):\n", + " sim_obj_id = self._sim.get_existing_object_ids()[0]\n", + " previous_position = np.array(\n", + " self._sim.get_translation(sim_obj_id)\n", + " ).tolist()\n", + "\n", + " agent_state = self._sim.get_agent_state()\n", + " agent_position = agent_state.position\n", + "\n", + " self._metric = 
self._euclidean_distance(\n", + " previous_position, agent_position\n", + " )\n", + "\n", + "\n", + "# -----------------------------------------------------------------------------\n", + "# # REARRANGEMENT TASK GRIPPED OBJECT SENSOR\n", + "# -----------------------------------------------------------------------------\n", + "_C.TASK.GRIPPED_OBJECT_SENSOR = CN()\n", + "_C.TASK.GRIPPED_OBJECT_SENSOR.TYPE = \"GrippedObjectSensor\"\n", + "# -----------------------------------------------------------------------------\n", + "# # REARRANGEMENT TASK ALL OBJECT POSITIONS SENSOR\n", + "# -----------------------------------------------------------------------------\n", + "_C.TASK.OBJECT_POSITION = CN()\n", + "_C.TASK.OBJECT_POSITION.TYPE = \"ObjectPosition\"\n", + "_C.TASK.OBJECT_POSITION.GOAL_FORMAT = \"POLAR\"\n", + "_C.TASK.OBJECT_POSITION.DIMENSIONALITY = 2\n", + "# -----------------------------------------------------------------------------\n", + "# # REARRANGEMENT TASK ALL OBJECT GOALS SENSOR\n", + "# -----------------------------------------------------------------------------\n", + "_C.TASK.OBJECT_GOAL = CN()\n", + "_C.TASK.OBJECT_GOAL.TYPE = \"ObjectGoal\"\n", + "_C.TASK.OBJECT_GOAL.GOAL_FORMAT = \"POLAR\"\n", + "_C.TASK.OBJECT_GOAL.DIMENSIONALITY = 2\n", + "# -----------------------------------------------------------------------------\n", + "# # OBJECT_DISTANCE_TO_GOAL MEASUREMENT\n", + "# -----------------------------------------------------------------------------\n", + "_C.TASK.OBJECT_TO_GOAL_DISTANCE = CN()\n", + "_C.TASK.OBJECT_TO_GOAL_DISTANCE.TYPE = \"ObjectToGoalDistance\"\n", + "# -----------------------------------------------------------------------------\n", + "# # OBJECT_DISTANCE_FROM_AGENT MEASUREMENT\n", + "# -----------------------------------------------------------------------------\n", + "_C.TASK.AGENT_TO_OBJECT_DISTANCE = CN()\n", + "_C.TASK.AGENT_TO_OBJECT_DISTANCE.TYPE = \"AgentToObjectDistance\"\n", + "\n", + "from habitat.config.default import CN, Config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title Define `RearrangementTask` by extending `NavigationTask`\n", + "from habitat.tasks.nav.nav import NavigationTask, merge_sim_episode_config\n", + "\n", + "\n", + "def merge_sim_episode_with_object_config(\n", + " sim_config: Config, episode: Type[Episode]\n", + ") -> Any:\n", + " sim_config = merge_sim_episode_config(sim_config, episode)\n", + " sim_config.defrost()\n", + " sim_config.objects = [episode.objects.__dict__]\n", + " sim_config.freeze()\n", + "\n", + " return sim_config\n", + "\n", + "\n", + "@registry.register_task(name=\"RearrangementTask-v0\")\n", + "class RearrangementTask(NavigationTask):\n", + " r\"\"\"Embodied Rearrangement Task\n", + " Goal: An agent must place objects at their corresponding goal position.\n", + " \"\"\"\n", + "\n", + " def __init__(self, **kwargs) -> None:\n", + " super().__init__(**kwargs)\n", + "\n", + " def overwrite_sim_config(self, sim_config, episode):\n", + " return merge_sim_episode_with_object_config(sim_config, episode)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Implement a hard-coded and an RL agent\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title Load the `RearrangementTask` in Habitat-Lab and run a hard-coded agent\n", + "import habitat\n", + "\n", + "config = habitat.get_config(\"configs/tasks/pointnav.yaml\")\n", + 
"config.defrost()\n", + "config.ENVIRONMENT.MAX_EPISODE_STEPS = 50\n", + "config.SIMULATOR.TYPE = \"RearrangementSim-v0\"\n", + "config.SIMULATOR.ACTION_SPACE_CONFIG = \"RearrangementActions-v0\"\n", + "config.SIMULATOR.GRAB_DISTANCE = 2.0\n", + "config.SIMULATOR.HABITAT_SIM_V0.ENABLE_PHYSICS = True\n", + "config.TASK.TYPE = \"RearrangementTask-v0\"\n", + "config.TASK.SUCCESS_DISTANCE = 1.0\n", + "config.TASK.SENSORS = [\n", + " \"GRIPPED_OBJECT_SENSOR\",\n", + " \"OBJECT_POSITION\",\n", + " \"OBJECT_GOAL\",\n", + "]\n", + "config.TASK.GOAL_SENSOR_UUID = \"object_goal\"\n", + "config.TASK.MEASUREMENTS = [\n", + " \"OBJECT_TO_GOAL_DISTANCE\",\n", + " \"AGENT_TO_OBJECT_DISTANCE\",\n", + "]\n", + "config.TASK.POSSIBLE_ACTIONS = [\"STOP\", \"MOVE_FORWARD\", \"GRAB_RELEASE\"]\n", + "config.DATASET.TYPE = \"RearrangementDataset-v0\"\n", + "config.DATASET.SPLIT = \"train\"\n", + "config.DATASET.DATA_PATH = (\n", + " \"data/datasets/rearrangement/coda/v1/{split}/{split}.json.gz\"\n", + ")\n", + "config.freeze()\n", + "\n", + "\n", + "def print_info(obs, metrics):\n", + " print(\n", + " \"Gripped Object: {}, Distance To Object: {}, Distance To Goal: {}\".format(\n", + " obs[\"gripped_object_id\"],\n", + " metrics[\"agent_to_object_distance\"],\n", + " metrics[\"object_to_goal_distance\"],\n", + " )\n", + " )\n", + "\n", + "\n", + "try: # Got to make initialization idiot proof\n", + " sim.close()\n", + "except NameError:\n", + " pass\n", + "\n", + "with habitat.Env(config) as env:\n", + " obs = env.reset()\n", + " obs_list = []\n", + " # Get closer to the object\n", + " while True:\n", + " obs = env.step(1)\n", + " obs_list.append(obs)\n", + " metrics = env.get_metrics()\n", + " print_info(obs, metrics)\n", + " if metrics[\"agent_to_object_distance\"] < 2.0:\n", + " break\n", + "\n", + " # Grab the object\n", + " obs = env.step(2)\n", + " obs_list.append(obs)\n", + " metrics = env.get_metrics()\n", + " print_info(obs, metrics)\n", + " assert obs[\"gripped_object_id\"] != -1\n", + "\n", + " # Get closer to the goal\n", + " while True:\n", + " obs = env.step(1)\n", + " obs_list.append(obs)\n", + " metrics = env.get_metrics()\n", + " print_info(obs, metrics)\n", + " if metrics[\"object_to_goal_distance\"] < 2.0:\n", + " break\n", + "\n", + " # Release the object\n", + " obs = env.step(2)\n", + " obs_list.append(obs)\n", + " metrics = env.get_metrics()\n", + " print_info(obs, metrics)\n", + " assert obs[\"gripped_object_id\"] == -1\n", + "\n", + " if make_video:\n", + " make_video_cv2(\n", + " obs_list,\n", + " [190, 128],\n", + " \"hard-coded-agent\",\n", + " fps=5.0,\n", + " open_vid=show_video,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title Create a task specific RL Environment with a new reward definition.\n", + "# @markdown We create a `RearragenmentRLEnv` class and modify the `get_reward()` function.\n", + "# @markdown The reward sturcture is as follows:\n", + "# @markdown - The agent gets a positive reward if the agent gets closer to the object otherwise a negative reward.\n", + "# @markdown - The agent gets a positive reward if it moves the object closer to goal otherwise a negative reward.\n", + "# @markdown - The agent gets a positive reward when the agent \"picks\" up an object for the first time. 
For all other \"grab/release\" action, it gets a negative reward.\n", + "# @markdown - The agent gets a slack penalty of -0.01 for every action it takes in the environment.\n", + "# @markdown - Finally the agent gets a large success reward when the episode is completed successfully.\n", + "\n", + "from typing import Optional, Type\n", + "\n", + "import numpy as np\n", + "\n", + "import habitat\n", + "from habitat import Config, Dataset\n", + "from habitat_baselines.common.baseline_registry import baseline_registry\n", + "from habitat_baselines.common.environments import NavRLEnv\n", + "\n", + "\n", + "@baseline_registry.register_env(name=\"RearrangementRLEnv\")\n", + "class RearrangementRLEnv(NavRLEnv):\n", + " def __init__(self, config: Config, dataset: Optional[Dataset] = None):\n", + " self._prev_measure = {\n", + " \"agent_to_object_distance\": 0.0,\n", + " \"object_to_goal_distance\": 0.0,\n", + " \"gripped_object_id\": -1,\n", + " \"gripped_object_count\": 0,\n", + " }\n", + "\n", + " super().__init__(config, dataset)\n", + "\n", + " self._success_distance = self._core_env_config.TASK.SUCCESS_DISTANCE\n", + "\n", + " def reset(self):\n", + " self._previous_action = None\n", + " observations = super().reset()\n", + "\n", + " self._prev_measure.update(self.habitat_env.get_metrics())\n", + " self._prev_measure[\"gripped_object_id\"] = -1\n", + " self._prev_measure[\"gripped_object_count\"] = 0\n", + "\n", + " return observations\n", + "\n", + " def step(self, *args, **kwargs):\n", + " self._previous_action = kwargs[\"action\"]\n", + " return super().step(*args, **kwargs)\n", + "\n", + " def get_reward_range(self):\n", + " return (\n", + " self._rl_config.SLACK_REWARD - 1.0,\n", + " self._rl_config.SUCCESS_REWARD + 1.0,\n", + " )\n", + "\n", + " def get_reward(self, observations):\n", + " reward = self._rl_config.SLACK_REWARD\n", + " gripped_success_reward = 0.0\n", + " episode_success_reward = 0.0\n", + " agent_to_object_dist_reward = 0.0\n", + " object_to_goal_dist_reward = 0.0\n", + "\n", + " action_name = self._env.task.get_action_name(\n", + " self._previous_action[\"action\"]\n", + " )\n", + "\n", + " # If object grabbed, add a success reward\n", + " # The reward gets awarded only once for an object.\n", + " if (\n", + " action_name == \"GRAB_RELEASE\"\n", + " and observations[\"gripped_object_id\"] >= 0\n", + " ):\n", + " obj_id = observations[\"gripped_object_id\"]\n", + " self._prev_measure[\"gripped_object_count\"] += 1\n", + "\n", + " gripped_success_reward = (\n", + " self._rl_config.GRIPPED_SUCCESS_REWARD\n", + " if self._prev_measure[\"gripped_object_count\"] == 1\n", + " else 0.0\n", + " )\n", + " # add a penalty everytime grab/action is called and doesn't do anything\n", + " elif action_name == \"GRAB_RELEASE\":\n", + " gripped_success_reward += -0.1\n", + "\n", + " self._prev_measure[\"gripped_object_id\"] = observations[\n", + " \"gripped_object_id\"\n", + " ]\n", + "\n", + " # If the action is not a grab/release action, and the agent\n", + " # has not picked up an object, then give reward based on agent to\n", + " # object distance.\n", + " if (\n", + " action_name != \"GRAB_RELEASE\"\n", + " and self._prev_measure[\"gripped_object_id\"] == -1\n", + " ):\n", + " agent_to_object_dist_reward = self.get_agent_to_object_dist_reward(\n", + " observations\n", + " )\n", + "\n", + " # If the action is not a grab/release action, and the agent\n", + " # has picked up an object, then give reward based on object to\n", + " # to goal distance.\n", + " if (\n", + " action_name != 
\"GRAB_RELEASE\"\n", + " and self._prev_measure[\"gripped_object_id\"] != -1\n", + " ):\n", + " object_to_goal_dist_reward = self.get_object_to_goal_dist_reward()\n", + "\n", + " if (\n", + " self._episode_success(observations)\n", + " and self._prev_measure[\"gripped_object_id\"] == -1\n", + " and action_name == \"STOP\"\n", + " ):\n", + " episode_success_reward = self._rl_config.SUCCESS_REWARD\n", + "\n", + " reward += (\n", + " agent_to_object_dist_reward\n", + " + object_to_goal_dist_reward\n", + " + gripped_success_reward\n", + " + episode_success_reward\n", + " )\n", + "\n", + " return reward\n", + "\n", + " def get_agent_to_object_dist_reward(self, observations):\n", + " \"\"\"\n", + " Encourage the agent to move towards the closest object which is not already in place.\n", + " \"\"\"\n", + " curr_metric = self._env.get_metrics()[\"agent_to_object_distance\"]\n", + " prev_metric = self._prev_measure[\"agent_to_object_distance\"]\n", + " dist_reward = prev_metric - curr_metric\n", + "\n", + " self._prev_measure[\"agent_to_object_distance\"] = curr_metric\n", + "\n", + " return dist_reward\n", + "\n", + " def get_object_to_goal_dist_reward(self):\n", + " curr_metric = self._env.get_metrics()[\"object_to_goal_distance\"]\n", + " prev_metric = self._prev_measure[\"object_to_goal_distance\"]\n", + " dist_reward = prev_metric - curr_metric\n", + "\n", + " self._prev_measure[\"object_to_goal_distance\"] = curr_metric\n", + "\n", + " return dist_reward\n", + "\n", + " def _episode_success(self, observations):\n", + " r\"\"\"Returns True if object is within distance threshold of the goal.\"\"\"\n", + " dist = self._env.get_metrics()[\"object_to_goal_distance\"]\n", + " if (\n", + " abs(dist) > self._success_distance\n", + " or observations[\"gripped_object_id\"] != -1\n", + " ):\n", + " return False\n", + " return True\n", + "\n", + " def _gripped_success(self, observations):\n", + " if (\n", + " observations[\"gripped_object_id\"] >= 0\n", + " and observations[\"gripped_object_id\"]\n", + " != self._prev_measure[\"gripped_object_id\"]\n", + " ):\n", + " return True\n", + "\n", + " return False\n", + "\n", + " def get_done(self, observations):\n", + " done = False\n", + " action_name = self._env.task.get_action_name(\n", + " self._previous_action[\"action\"]\n", + " )\n", + " if self._env.episode_over or (\n", + " self._episode_success(observations)\n", + " and self._prev_measure[\"gripped_object_id\"] == -1\n", + " and action_name == \"STOP\"\n", + " ):\n", + " done = True\n", + " return done\n", + "\n", + " def get_info(self, observations):\n", + " info = self.habitat_env.get_metrics()\n", + " info[\"episode_success\"] = self._episode_success(observations)\n", + " return info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import time\n", + "from typing import Any, Dict, List, Optional\n", + "\n", + "import numpy as np\n", + "from torch.optim.lr_scheduler import LambdaLR\n", + "\n", + "from habitat import Config, logger\n", + "from habitat.utils.visualizations.utils import observations_to_image\n", + "from habitat_baselines.common.baseline_registry import baseline_registry\n", + "from habitat_baselines.common.environments import get_env_class\n", + "from habitat_baselines.common.tensorboard_utils import TensorboardWriter\n", + "from habitat_baselines.rl.models.rnn_state_encoder import (\n", + " build_rnn_state_encoder,\n", + ")\n", + "from habitat_baselines.rl.ppo import PPO\n", + "from 
habitat_baselines.rl.ppo.policy import Net, Policy\n",
+ "from habitat_baselines.rl.ppo.ppo_trainer import PPOTrainer\n",
+ "from habitat_baselines.utils.common import batch_obs, generate_video\n",
+ "from habitat_baselines.utils.env_utils import make_env_fn\n",
+ "\n",
+ "\n",
+ "def construct_envs(\n",
+ "    config,\n",
+ "    env_class,\n",
+ "    workers_ignore_signals=False,\n",
+ "):\n",
+ "    r\"\"\"Create VectorEnv object with specified config and env class type.\n",
+ "    To allow better performance, datasets are split into smaller ones for\n",
+ "    each individual env, grouped by scenes.\n",
+ "\n",
+ "    :param config: configs that contain num_processes as well as information\n",
+ "        necessary to create individual environments.\n",
+ "    :param env_class: class type of the envs to be created.\n",
+ "    :param workers_ignore_signals: Passed to :ref:`habitat.VectorEnv`'s constructor\n",
+ "\n",
+ "    :return: VectorEnv object created according to specification.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    num_processes = config.NUM_ENVIRONMENTS\n",
+ "    configs = []\n",
+ "    env_classes = [env_class for _ in range(num_processes)]\n",
+ "    dataset = habitat.datasets.make_dataset(config.TASK_CONFIG.DATASET.TYPE)\n",
+ "    scenes = config.TASK_CONFIG.DATASET.CONTENT_SCENES\n",
+ "    if \"*\" in config.TASK_CONFIG.DATASET.CONTENT_SCENES:\n",
+ "        scenes = dataset.get_scenes_to_load(config.TASK_CONFIG.DATASET)\n",
+ "\n",
+ "    if num_processes > 1:\n",
+ "        if len(scenes) == 0:\n",
+ "            raise RuntimeError(\n",
+ "                \"No scenes to load, multiple process logic relies on being able to split scenes uniquely between processes\"\n",
+ "            )\n",
+ "\n",
+ "        if len(scenes) < num_processes:\n",
+ "            scenes = scenes * num_processes\n",
+ "\n",
+ "        random.shuffle(scenes)\n",
+ "\n",
+ "    scene_splits = [[] for _ in range(num_processes)]\n",
+ "    for idx, scene in enumerate(scenes):\n",
+ "        scene_splits[idx % len(scene_splits)].append(scene)\n",
+ "\n",
+ "    assert sum(map(len, scene_splits)) == len(scenes)\n",
+ "\n",
+ "    for i in range(num_processes):\n",
+ "        proc_config = config.clone()\n",
+ "        proc_config.defrost()\n",
+ "\n",
+ "        task_config = proc_config.TASK_CONFIG\n",
+ "        task_config.SEED = task_config.SEED + i\n",
+ "        if len(scenes) > 0:\n",
+ "            task_config.DATASET.CONTENT_SCENES = scene_splits[i]\n",
+ "\n",
+ "        task_config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = (\n",
+ "            config.SIMULATOR_GPU_ID\n",
+ "        )\n",
+ "\n",
+ "        task_config.SIMULATOR.AGENT_0.SENSORS = config.SENSORS\n",
+ "\n",
+ "        proc_config.freeze()\n",
+ "        configs.append(proc_config)\n",
+ "\n",
+ "    envs = habitat.ThreadedVectorEnv(\n",
+ "        make_env_fn=make_env_fn,\n",
+ "        env_fn_args=tuple(zip(configs, env_classes)),\n",
+ "        workers_ignore_signals=workers_ignore_signals,\n",
+ "    )\n",
+ "    return envs\n",
+ "\n",
+ "\n",
+ "class RearrangementBaselinePolicy(Policy):\n",
+ "    def __init__(self, observation_space, action_space, hidden_size=512):\n",
+ "        super().__init__(\n",
+ "            RearrangementBaselineNet(\n",
+ "                observation_space=observation_space, hidden_size=hidden_size\n",
+ "            ),\n",
+ "            action_space.n,\n",
+ "        )\n",
+ "\n",
+ "    @classmethod\n",
+ "    def from_config(cls, config, envs):\n",
+ "        pass\n",
+ "\n",
+ "\n",
+ "class RearrangementBaselineNet(Net):\n",
+ "    r\"\"\"Network which encodes the object-position and object-goal observations\n",
+ "    and passes them through an RNN state encoder.\n",
+ "    \"\"\"\n",
+ "\n",
+ "    def __init__(self, observation_space, hidden_size):\n",
+ "        super().__init__()\n",
+ "\n",
+ "        self._n_input_goal = observation_space.spaces[\n",
+ "            
ObjectGoal.cls_uuid\n", + " ].shape[0]\n", + "\n", + " self._hidden_size = hidden_size\n", + "\n", + " self.state_encoder = build_rnn_state_encoder(\n", + " 2 * self._n_input_goal, self._hidden_size\n", + " )\n", + "\n", + " self.train()\n", + "\n", + " @property\n", + " def output_size(self):\n", + " return self._hidden_size\n", + "\n", + " @property\n", + " def is_blind(self):\n", + " return False\n", + "\n", + " @property\n", + " def num_recurrent_layers(self):\n", + " return self.state_encoder.num_recurrent_layers\n", + "\n", + " def forward(self, observations, rnn_hidden_states, prev_actions, masks):\n", + " object_goal_encoding = observations[ObjectGoal.cls_uuid]\n", + " object_pos_encoding = observations[ObjectPosition.cls_uuid]\n", + "\n", + " x = [object_goal_encoding, object_pos_encoding]\n", + "\n", + " x = torch.cat(x, dim=1)\n", + " x, rnn_hidden_states = self.state_encoder(x, rnn_hidden_states, masks)\n", + "\n", + " return x, rnn_hidden_states\n", + "\n", + "\n", + "@baseline_registry.register_trainer(name=\"ppo-rearrangement\")\n", + "class RearrangementTrainer(PPOTrainer):\n", + " supported_tasks = [\"RearrangementTask-v0\"]\n", + "\n", + " def _setup_actor_critic_agent(self, ppo_cfg: Config) -> None:\n", + " r\"\"\"Sets up actor critic and agent for PPO.\n", + "\n", + " Args:\n", + " ppo_cfg: config node with relevant params\n", + "\n", + " Returns:\n", + " None\n", + " \"\"\"\n", + " logger.add_filehandler(self.config.LOG_FILE)\n", + "\n", + " self.actor_critic = RearrangementBaselinePolicy(\n", + " observation_space=self.envs.observation_spaces[0],\n", + " action_space=self.envs.action_spaces[0],\n", + " hidden_size=ppo_cfg.hidden_size,\n", + " )\n", + " self.actor_critic.to(self.device)\n", + "\n", + " self.agent = PPO(\n", + " actor_critic=self.actor_critic,\n", + " clip_param=ppo_cfg.clip_param,\n", + " ppo_epoch=ppo_cfg.ppo_epoch,\n", + " num_mini_batch=ppo_cfg.num_mini_batch,\n", + " value_loss_coef=ppo_cfg.value_loss_coef,\n", + " entropy_coef=ppo_cfg.entropy_coef,\n", + " lr=ppo_cfg.lr,\n", + " eps=ppo_cfg.eps,\n", + " max_grad_norm=ppo_cfg.max_grad_norm,\n", + " use_normalized_advantage=ppo_cfg.use_normalized_advantage,\n", + " )\n", + "\n", + " def _init_envs(self, config=None):\n", + " if config is None:\n", + " config = self.config\n", + "\n", + " self.envs = construct_envs(config, get_env_class(config.ENV_NAME))\n", + "\n", + " def train(self) -> None:\n", + " r\"\"\"Main method for training PPO.\n", + "\n", + " Returns:\n", + " None\n", + " \"\"\"\n", + " if self._is_distributed:\n", + " raise RuntimeError(\"This trainer does not support distributed\")\n", + " self._init_train()\n", + "\n", + " count_checkpoints = 0\n", + "\n", + " lr_scheduler = LambdaLR(\n", + " optimizer=self.agent.optimizer,\n", + " lr_lambda=lambda _: 1 - self.percent_done(),\n", + " )\n", + " ppo_cfg = self.config.RL.PPO\n", + "\n", + " with TensorboardWriter(\n", + " self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs\n", + " ) as writer:\n", + " while not self.is_done():\n", + "\n", + " if ppo_cfg.use_linear_clip_decay:\n", + " self.agent.clip_param = ppo_cfg.clip_param * (\n", + " 1 - self.percent_done()\n", + " )\n", + "\n", + " count_steps_delta = 0\n", + " for _step in range(ppo_cfg.num_steps):\n", + " count_steps_delta += self._collect_rollout_step()\n", + "\n", + " (\n", + " value_loss,\n", + " action_loss,\n", + " dist_entropy,\n", + " ) = self._update_agent()\n", + "\n", + " if ppo_cfg.use_linear_lr_decay:\n", + " lr_scheduler.step() # type: ignore\n", + "\n", + " losses 
= self._coalesce_post_step(\n", + " dict(value_loss=value_loss, action_loss=action_loss),\n", + " count_steps_delta,\n", + " )\n", + " self.num_updates_done += 1\n", + "\n", + " deltas = {\n", + " k: (\n", + " (v[-1] - v[0]).sum().item()\n", + " if len(v) > 1\n", + " else v[0].sum().item()\n", + " )\n", + " for k, v in self.window_episode_stats.items()\n", + " }\n", + " deltas[\"count\"] = max(deltas[\"count\"], 1.0)\n", + "\n", + " writer.add_scalar(\n", + " \"reward\",\n", + " deltas[\"reward\"] / deltas[\"count\"],\n", + " self.num_steps_done,\n", + " )\n", + "\n", + " # Check to see if there are any metrics\n", + " # that haven't been logged yet\n", + "\n", + " for k, v in deltas.items():\n", + " if k not in {\"reward\", \"count\"}:\n", + " writer.add_scalar(\n", + " \"metric/\" + k,\n", + " v / deltas[\"count\"],\n", + " self.num_steps_done,\n", + " )\n", + "\n", + " losses = [value_loss, action_loss]\n", + " for l, k in zip(losses, [\"value\", \"policy\"]):\n", + " writer.add_scalar(\"losses/\" + k, l, self.num_steps_done)\n", + "\n", + " # log stats\n", + " if self.num_updates_done % self.config.LOG_INTERVAL == 0:\n", + " logger.info(\n", + " \"update: {}\\tfps: {:.3f}\\t\".format(\n", + " self.num_updates_done,\n", + " self.num_steps_done / (time.time() - self.t_start),\n", + " )\n", + " )\n", + "\n", + " logger.info(\n", + " \"update: {}\\tenv-time: {:.3f}s\\tpth-time: {:.3f}s\\t\"\n", + " \"frames: {}\".format(\n", + " self.num_updates_done,\n", + " self.env_time,\n", + " self.pth_time,\n", + " self.num_steps_done,\n", + " )\n", + " )\n", + "\n", + " logger.info(\n", + " \"Average window size: {} {}\".format(\n", + " len(self.window_episode_stats[\"count\"]),\n", + " \" \".join(\n", + " \"{}: {:.3f}\".format(k, v / deltas[\"count\"])\n", + " for k, v in deltas.items()\n", + " if k != \"count\"\n", + " ),\n", + " )\n", + " )\n", + "\n", + " # checkpoint model\n", + " if self.should_checkpoint():\n", + " self.save_checkpoint(\n", + " f\"ckpt.{count_checkpoints}.pth\",\n", + " dict(step=self.num_steps_done),\n", + " )\n", + " count_checkpoints += 1\n", + "\n", + " self.envs.close()\n", + "\n", + " def eval(self) -> None:\n", + " r\"\"\"Evaluates the current model\n", + " Returns:\n", + " None\n", + " \"\"\"\n", + "\n", + " config = self.config.clone()\n", + "\n", + " if len(self.config.VIDEO_OPTION) > 0:\n", + " config.defrost()\n", + " config.NUM_ENVIRONMENTS = 1\n", + " config.freeze()\n", + "\n", + " logger.info(f\"env config: {config}\")\n", + " with construct_envs(config, get_env_class(config.ENV_NAME)) as envs:\n", + " observations = envs.reset()\n", + " batch = batch_obs(observations, device=self.device)\n", + "\n", + " current_episode_reward = torch.zeros(\n", + " envs.num_envs, 1, device=self.device\n", + " )\n", + " ppo_cfg = self.config.RL.PPO\n", + " test_recurrent_hidden_states = torch.zeros(\n", + " config.NUM_ENVIRONMENTS,\n", + " self.actor_critic.net.num_recurrent_layers,\n", + " ppo_cfg.hidden_size,\n", + " device=self.device,\n", + " )\n", + " prev_actions = torch.zeros(\n", + " config.NUM_ENVIRONMENTS,\n", + " 1,\n", + " device=self.device,\n", + " dtype=torch.long,\n", + " )\n", + " not_done_masks = torch.zeros(\n", + " config.NUM_ENVIRONMENTS,\n", + " 1,\n", + " device=self.device,\n", + " dtype=torch.bool,\n", + " )\n", + "\n", + " rgb_frames = [\n", + " [] for _ in range(self.config.NUM_ENVIRONMENTS)\n", + " ] # type: List[List[np.ndarray]]\n", + "\n", + " if len(config.VIDEO_OPTION) > 0:\n", + " os.makedirs(config.VIDEO_DIR, exist_ok=True)\n", + "\n", + " 
self.actor_critic.eval()\n", + "\n", + " for _i in range(config.TASK_CONFIG.ENVIRONMENT.MAX_EPISODE_STEPS):\n", + " current_episodes = envs.current_episodes()\n", + "\n", + " with torch.no_grad():\n", + " (\n", + " _,\n", + " actions,\n", + " _,\n", + " test_recurrent_hidden_states,\n", + " ) = self.actor_critic.act(\n", + " batch,\n", + " test_recurrent_hidden_states,\n", + " prev_actions,\n", + " not_done_masks,\n", + " deterministic=False,\n", + " )\n", + "\n", + " prev_actions.copy_(actions)\n", + "\n", + " outputs = envs.step([a[0].item() for a in actions])\n", + "\n", + " observations, rewards, dones, infos = [\n", + " list(x) for x in zip(*outputs)\n", + " ]\n", + " batch = batch_obs(observations, device=self.device)\n", + "\n", + " not_done_masks = torch.tensor(\n", + " [[not done] for done in dones],\n", + " dtype=torch.bool,\n", + " device=\"cpu\",\n", + " )\n", + "\n", + " rewards = torch.tensor(\n", + " rewards, dtype=torch.float, device=self.device\n", + " ).unsqueeze(1)\n", + "\n", + " current_episode_reward += rewards\n", + "\n", + " # episode ended\n", + " if not not_done_masks[0].item():\n", + " generate_video(\n", + " video_option=self.config.VIDEO_OPTION,\n", + " video_dir=self.config.VIDEO_DIR,\n", + " images=rgb_frames[0],\n", + " episode_id=current_episodes[0].episode_id,\n", + " checkpoint_idx=0,\n", + " metrics=self._extract_scalars_from_info(infos[0]),\n", + " tb_writer=None,\n", + " )\n", + "\n", + " print(\"Evaluation Finished.\")\n", + " print(\"Success: {}\".format(infos[0][\"episode_success\"]))\n", + " print(\n", + " \"Reward: {}\".format(current_episode_reward[0].item())\n", + " )\n", + " print(\n", + " \"Distance To Goal: {}\".format(\n", + " infos[0][\"object_to_goal_distance\"]\n", + " )\n", + " )\n", + "\n", + " return\n", + "\n", + " # episode continues\n", + " elif len(self.config.VIDEO_OPTION) > 0:\n", + " frame = observations_to_image(observations[0], infos[0])\n", + " rgb_frames[0].append(frame)\n", + "\n", + " not_done_masks = not_done_masks.to(device=self.device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext tensorboard\n", + "%tensorboard --logdir data/tb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title Train an RL agent on a single episode\n", + "!if [ -d \"data/tb\" ]; then rm -r data/tb; fi\n", + "\n", + "import random\n", + "\n", + "import numpy as np\n", + "import torch\n", + "\n", + "import habitat\n", + "from habitat import Config\n", + "from habitat_baselines.config.default import get_config as get_baseline_config\n", + "\n", + "baseline_config = get_baseline_config(\n", + " \"habitat_baselines/config/pointnav/ppo_pointnav.yaml\"\n", + ")\n", + "baseline_config.defrost()\n", + "\n", + "baseline_config.TASK_CONFIG = config\n", + "baseline_config.TRAINER_NAME = \"ddppo\"\n", + "baseline_config.ENV_NAME = \"RearrangementRLEnv\"\n", + "baseline_config.SIMULATOR_GPU_ID = 0\n", + "baseline_config.TORCH_GPU_ID = 0\n", + "baseline_config.VIDEO_OPTION = [\"disk\"]\n", + "baseline_config.TENSORBOARD_DIR = \"data/tb\"\n", + "baseline_config.VIDEO_DIR = \"data/videos\"\n", + "baseline_config.NUM_ENVIRONMENTS = 2\n", + "baseline_config.SENSORS = [\"RGB_SENSOR\", \"DEPTH_SENSOR\"]\n", + "baseline_config.CHECKPOINT_FOLDER = \"data/checkpoints\"\n", + "baseline_config.TOTAL_NUM_STEPS = -1.0\n", + "\n", + "if vut.is_notebook():\n", + " baseline_config.NUM_UPDATES = 400 # @param {type:\"number\"}\n", + 
"else:\n", + " baseline_config.NUM_UPDATES = 1\n", + "\n", + "baseline_config.LOG_INTERVAL = 10\n", + "baseline_config.NUM_CHECKPOINTS = 5\n", + "baseline_config.LOG_FILE = \"data/checkpoints/train.log\"\n", + "baseline_config.EVAL.SPLIT = \"train\"\n", + "baseline_config.RL.SUCCESS_REWARD = 2.5 # @param {type:\"number\"}\n", + "baseline_config.RL.SUCCESS_MEASURE = \"object_to_goal_distance\"\n", + "baseline_config.RL.REWARD_MEASURE = \"object_to_goal_distance\"\n", + "baseline_config.RL.GRIPPED_SUCCESS_REWARD = 2.5 # @param {type:\"number\"}\n", + "\n", + "baseline_config.freeze()\n", + "random.seed(baseline_config.TASK_CONFIG.SEED)\n", + "np.random.seed(baseline_config.TASK_CONFIG.SEED)\n", + "torch.manual_seed(baseline_config.TASK_CONFIG.SEED)\n", + "\n", + "if __name__ == \"__main__\":\n", + " trainer = RearrangementTrainer(baseline_config)\n", + " trainer.train()\n", + " trainer.eval()\n", + "\n", + " if make_video:\n", + " video_file = os.listdir(\"data/videos\")[0]\n", + " vut.display_video(os.path.join(\"data/videos\", video_file))" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "Habitat Interactive Tasks", + "provenance": [], + "toc_visible": true + }, + "jupytext": { + "cell_metadata_filter": "-all", + "formats": "nb_python//py:percent,colabs//ipynb", + "notebook_metadata_filter": "all" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/habitat-lab-dialog/examples/tutorials/colabs/Habitat_Lab.ipynb b/habitat-lab-dialog/examples/tutorials/colabs/Habitat_Lab.ipynb new file mode 100644 index 0000000..2310321 --- /dev/null +++ b/habitat-lab-dialog/examples/tutorials/colabs/Habitat_Lab.ipynb @@ -0,0 +1,598 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title Installation\n", + "\n", + "!curl -L https://raw.githubusercontent.com/facebookresearch/habitat-sim/master/examples/colab_utils/colab_install.sh | NIGHTLY=true bash -s\n", + "!wget -c http://dl.fbaipublicfiles.com/habitat/mp3d_example.zip && unzip -o mp3d_example.zip -d /content/habitat-sim/data/scene_datasets/mp3d/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip uninstall --yes pyopenssl\n", + "!pip install pyopenssl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title Colab Setup and Imports { display-mode: \"form\" }\n", + "# @markdown (double click to see the code)\n", + "\n", + "import os\n", + "import random\n", + "import sys\n", + "\n", + "import git\n", + "import numpy as np\n", + "from gym import spaces\n", + "\n", + "%matplotlib inline\n", + "from matplotlib import pyplot as plt\n", + "\n", + "%cd \"/content/habitat-lab\"\n", + "\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " # This tells imageio to use the system FFMPEG that has hardware acceleration.\n", + " os.environ[\"IMAGEIO_FFMPEG_EXE\"] = \"/usr/bin/ffmpeg\"\n", + "repo = git.Repo(\".\", search_parent_directories=True)\n", + "dir_path = repo.working_tree_dir\n", + "%cd $dir_path\n", + "\n", + "from PIL import 
Image\n", + "\n", + "import habitat\n", + "from habitat.core.logging import logger\n", + "from habitat.core.registry import registry\n", + "from habitat.sims.habitat_simulator.actions import HabitatSimActions\n", + "from habitat.tasks.nav.nav import NavigationTask\n", + "from habitat_baselines.common.baseline_registry import baseline_registry\n", + "from habitat_baselines.config.default import get_config as get_baselines_config" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# @title Define Observation Display Utility Function { display-mode: \"form\" }\n", + "\n", + "# @markdown A convenient function that displays sensor observations with matplotlib.\n", + "\n", + "# @markdown (double click to see the code)\n", + "\n", + "\n", + "# Change to do something like this maybe: https://stackoverflow.com/a/41432704\n", + "def display_sample(\n", + " rgb_obs, semantic_obs=np.array([]), depth_obs=np.array([])\n", + "): # noqa B006\n", + " from habitat_sim.utils.common import d3_40_colors_rgb\n", + "\n", + " rgb_img = Image.fromarray(rgb_obs, mode=\"RGB\")\n", + "\n", + " arr = [rgb_img]\n", + " titles = [\"rgb\"]\n", + " if semantic_obs.size != 0:\n", + " semantic_img = Image.new(\n", + " \"P\", (semantic_obs.shape[1], semantic_obs.shape[0])\n", + " )\n", + " semantic_img.putpalette(d3_40_colors_rgb.flatten())\n", + " semantic_img.putdata((semantic_obs.flatten() % 40).astype(np.uint8))\n", + " semantic_img = semantic_img.convert(\"RGBA\")\n", + " arr.append(semantic_img)\n", + " titles.append(\"semantic\")\n", + "\n", + " if depth_obs.size != 0:\n", + " depth_img = Image.fromarray(\n", + " (depth_obs / 10 * 255).astype(np.uint8), mode=\"L\"\n", + " )\n", + " arr.append(depth_img)\n", + " titles.append(\"depth\")\n", + "\n", + " plt.figure(figsize=(12, 8))\n", + " for i, data in enumerate(arr):\n", + " ax = plt.subplot(1, 3, i + 1)\n", + " ax.axis(\"off\")\n", + " ax.set_title(titles[i])\n", + " plt.imshow(data)\n", + " plt.show(block=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup PointNav Task" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cat \"./configs/test/habitat_all_sensors_test.yaml\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " config = habitat.get_config(\n", + " config_paths=\"./configs/test/habitat_all_sensors_test.yaml\"\n", + " )\n", + "\n", + " try:\n", + " env.close()\n", + " except NameError:\n", + " pass\n", + " env = habitat.Env(config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " action = None\n", + " obs = env.reset()\n", + " valid_actions = [\"TURN_LEFT\", \"TURN_RIGHT\", \"MOVE_FORWARD\", \"STOP\"]\n", + " interactive_control = False # @param {type:\"boolean\"}\n", + " while action != \"STOP\":\n", + " display_sample(obs[\"rgb\"])\n", + " print(\n", + " \"distance to goal: {:.2f}\".format(\n", + " obs[\"pointgoal_with_gps_compass\"][0]\n", + " )\n", + " )\n", + " print(\n", + " \"angle to goal (radians): {:.2f}\".format(\n", + " obs[\"pointgoal_with_gps_compass\"][1]\n", + " )\n", + " )\n", + " if interactive_control:\n", + " action = input(\n", + " \"enter action out of {}:\\n\".format(\", \".join(valid_actions))\n", + " )\n", + " assert (\n", + " action in valid_actions\n", + " ), \"invalid action {} entered, 
choose one amongst \" + \",\".join(\n", + " valid_actions\n", + " )\n", + " else:\n", + " action = valid_actions.pop()\n", + " obs = env.step(\n", + " {\n", + " \"action\": action,\n", + " }\n", + " )\n", + "\n", + " env.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " print(env.get_metrics())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## RL Training" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " config = get_baselines_config(\n", + " \"./habitat_baselines/config/pointnav/ppo_pointnav_example.yaml\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set random seeds\n", + "if __name__ == \"__main__\":\n", + " seed = \"42\" # @param {type:\"string\"}\n", + " steps_in_thousands = \"10\" # @param {type:\"string\"}\n", + "\n", + " config.defrost()\n", + " config.TASK_CONFIG.SEED = int(seed)\n", + " config.TOTAL_NUM_STEPS = int(steps_in_thousands)\n", + " config.LOG_INTERVAL = 1\n", + " config.freeze()\n", + "\n", + " random.seed(config.TASK_CONFIG.SEED)\n", + " np.random.seed(config.TASK_CONFIG.SEED)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " trainer_init = baseline_registry.get_trainer(config.TRAINER_NAME)\n", + " trainer = trainer_init(config)\n", + " trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "lines_to_next_cell": 0 + }, + "outputs": [], + "source": [ + "# @markdown (double click to see the code)\n", + "\n", + "# example tensorboard visualization\n", + "# for more details refer to [link](https://github.com/facebookresearch/habitat-lab/tree/master/habitat_baselines#additional-utilities).\n", + "\n", + "try:\n", + " from IPython import display\n", + "\n", + " with open(\"./res/img/tensorboard_video_demo.gif\", \"rb\") as f:\n", + " display.display(display.Image(data=f.read(), format=\"png\"))\n", + "except ImportError:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Key Concepts\n", + "\n", + "All the concepts link to their definitions:\n", + "\n", + "1. [`habitat.sims.habitat_simulator.HabitatSim`](https://github.com/facebookresearch/habitat-lab/blob/master/habitat/sims/habitat_simulator/habitat_simulator.py#L159)\n", + "Thin wrapper over `habitat_sim` providing seamless integration with experimentation framework.\n", + "\n", + "\n", + "2. [`habitat.core.env.Env`](https://github.com/facebookresearch/habitat-lab/blob/master/habitat/core/env.py)\n", + "Abstraction for the universe of agent, task and simulator. Agents that you train and evaluate operate inside the environment.\n", + "\n", + "\n", + "3. [`habitat.core.env.RLEnv`](https://github.com/facebookresearch/habitat-lab/blob/71d409ab214a7814a9bd9b7e44fd25f57a0443ba/habitat/core/env.py#L278)\n", + "Extends the `Env` class for reinforcement learning by defining the reward and other required components.\n", + "\n", + "\n", + "4. [`habitat.core.embodied_task.EmbodiedTask`](https://github.com/facebookresearch/habitat-lab/blob/71d409ab214a7814a9bd9b7e44fd25f57a0443ba/habitat/core/embodied_task.py#L242)\n", + "Defines the task that the agent needs to solve. This class holds the definition of observation space, action space, measures, simulator usage. 
Eg: PointNav, ObjectNav.\n", + "\n", + "\n", + "5. [`habitat.core.dataset.Dataset`](https://github.com/facebookresearch/habitat-lab/blob/4b6da1c4f8eb287cea43e70c50fe1d615a261198/habitat/core/dataset.py#L63)\n", + "Wrapper over the information required for the dataset of an embodied task; contains the definition of and interaction with an `episode`.\n", + "\n", + "\n", + "6. [`habitat.core.embodied_task.Measure`](https://github.com/facebookresearch/habitat-lab/blob/master/habitat/core/embodied_task.py#L82)\n", + "Defines the metrics for an embodied task, eg: [SPL](https://github.com/facebookresearch/habitat-lab/blob/d0db1b55be57abbacc5563dca2ca14654c545552/habitat/tasks/nav/nav.py#L533).\n", + "\n", + "\n", + "7. [`habitat_baselines`](https://github.com/facebookresearch/habitat-lab/tree/71d409ab214a7814a9bd9b7e44fd25f57a0443ba/habitat_baselines)\n", + "RL, SLAM, heuristic baseline implementations for the different embodied tasks." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a new Task" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " config = habitat.get_config(\n", + " config_paths=\"./configs/test/habitat_all_sensors_test.yaml\"\n", + " )\n", + "\n", + "\n", + "@registry.register_task(name=\"TestNav-v0\")\n", + "class NewNavigationTask(NavigationTask):\n", + " def __init__(self, config, sim, dataset):\n", + " logger.info(\"Creating a new type of task\")\n", + " super().__init__(config=config, sim=sim, dataset=dataset)\n", + "\n", + " def _check_episode_is_active(self, *args, **kwargs):\n", + " logger.info(\n", + " \"Current agent position: {}\".format(self._sim.get_agent_state())\n", + " )\n", + " collision = self._sim.previous_step_collided\n", + " stop_called = not getattr(self, \"is_stop_called\", False)\n", + " return collision or stop_called\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " config.defrost()\n", + " config.TASK.TYPE = \"TestNav-v0\"\n", + " config.freeze()\n", + "\n", + " try:\n", + " env.close()\n", + " except NameError:\n", + " pass\n", + " env = habitat.Env(config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " action = None\n", + " obs = env.reset()\n", + " valid_actions = [\"TURN_LEFT\", \"TURN_RIGHT\", \"MOVE_FORWARD\", \"STOP\"]\n", + " interactive_control = False # @param {type:\"boolean\"}\n", + " while env.episode_over is not True:\n", + " display_sample(obs[\"rgb\"])\n", + " if interactive_control:\n", + " action = input(\n", + " \"enter action out of {}:\\n\".format(\", \".join(valid_actions))\n", + " )\n", + " assert (\n", + " action in valid_actions\n", + " ), \"invalid action {} entered, choose one amongst \" + \",\".join(\n", + " valid_actions\n", + " )\n", + " else:\n", + " action = valid_actions.pop()\n", + " obs = env.step(\n", + " {\n", + " \"action\": action,\n", + " \"action_args\": None,\n", + " }\n", + " )\n", + " print(\"Episode over:\", env.episode_over)\n", + "\n", + " env.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a new Sensor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "@registry.register_sensor(name=\"agent_position_sensor\")\n", + "class AgentPositionSensor(habitat.Sensor):\n", + " def __init__(self, sim, config, **kwargs):\n", + " super().__init__(config=config)\n", + " self._sim = sim\n", + "\n", + " # Defines 
the name of the sensor in the sensor suite dictionary\n", + " def _get_uuid(self, *args, **kwargs):\n", + " return \"agent_position\"\n", + "\n", + " # Defines the type of the sensor\n", + " def _get_sensor_type(self, *args, **kwargs):\n", + " return habitat.SensorTypes.POSITION\n", + "\n", + " # Defines the size and range of the observations of the sensor\n", + " def _get_observation_space(self, *args, **kwargs):\n", + " return spaces.Box(\n", + " low=np.finfo(np.float32).min,\n", + " high=np.finfo(np.float32).max,\n", + " shape=(3,),\n", + " dtype=np.float32,\n", + " )\n", + "\n", + " # This is called whenever reset is called or an action is taken\n", + " def get_observation(self, observations, *args, episode, **kwargs):\n", + " return self._sim.get_agent_state().position" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " config = habitat.get_config(\n", + " config_paths=\"./configs/test/habitat_all_sensors_test.yaml\"\n", + " )\n", + "\n", + " config.defrost()\n", + " # Now define the config for the sensor\n", + " config.TASK.AGENT_POSITION_SENSOR = habitat.Config()\n", + " # Use the custom name\n", + " config.TASK.AGENT_POSITION_SENSOR.TYPE = \"agent_position_sensor\"\n", + " # Add the sensor to the list of sensors in use\n", + " config.TASK.SENSORS.append(\"AGENT_POSITION_SENSOR\")\n", + " config.freeze()\n", + "\n", + " try:\n", + " env.close()\n", + " except NameError:\n", + " pass\n", + " env = habitat.Env(config=config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " obs = env.reset()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " obs.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " print(obs[\"agent_position\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + " env.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a new Agent" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# An example agent which can be submitted to habitat-challenge.\n", + "# To participate and for more details refer to:\n", + "# - https://aihabitat.org/challenge/2020/\n", + "# - https://github.com/facebookresearch/habitat-challenge\n", + "\n", + "\n", + "class ForwardOnlyAgent(habitat.Agent):\n", + " def __init__(self, success_distance, goal_sensor_uuid):\n", + " self.dist_threshold_to_stop = success_distance\n", + " self.goal_sensor_uuid = goal_sensor_uuid\n", + "\n", + " def reset(self):\n", + " pass\n", + "\n", + " def is_goal_reached(self, observations):\n", + " dist = observations[self.goal_sensor_uuid][0]\n", + " return dist <= self.dist_threshold_to_stop\n", + "\n", + " def act(self, observations):\n", + " if self.is_goal_reached(observations):\n", + " action = HabitatSimActions.STOP\n", + " else:\n", + " action = HabitatSimActions.MOVE_FORWARD\n", + " return {\"action\": action}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Other Examples\n", + "\n", + "[Create a new action space](https://github.com/facebookresearch/habitat-lab/blob/master/examples/new_actions.py)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 
@title Sim2Real with Habitat { display-mode: \"form\" }\n", + "\n", + "try:\n", + " from IPython.display import HTML\n", + "\n", + " HTML(\n", + " ''\n", + " )\n", + "except ImportError:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Deploy habitat-sim trained models on real robots with the [habitat-pyrobot bridge](https://github.com/facebookresearch/habitat-lab/blob/71d409ab214a7814a9bd9b7e44fd25f57a0443ba/habitat/sims/pyrobot/pyrobot.py)\n", + "\n", + "```python\n", + "# Are we in sim or reality?\n", + "if args.use_robot: # Use LoCoBot via PyRobot\n", + " config.SIMULATOR.TYPE = \"PyRobot-Locobot-v0\"\n", + "else: # Use simulation\n", + " config.SIMULATOR.TYPE = \"Habitat-Sim-v0\"\n", + "```\n", + "\n", + "Paper: [https://arxiv.org/abs/1912.06321](https://arxiv.org/abs/1912.06321)" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "Habitat Lab", + "provenance": [] + }, + "jupytext": { + "cell_metadata_filter": "-all", + "formats": "nb_python//py:percent,colabs//ipynb", + "main_language": "python", + "notebook_metadata_filter": "all" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/habitat-lab-dialog/examples/tutorials/nb_python/Habitat_Interactive_Tasks.py b/habitat-lab-dialog/examples/tutorials/nb_python/Habitat_Interactive_Tasks.py new file mode 100644 index 0000000..0e42bf4 --- /dev/null +++ b/habitat-lab-dialog/examples/tutorials/nb_python/Habitat_Interactive_Tasks.py @@ -0,0 +1,2060 @@ +# --- +# jupyter: +# accelerator: GPU +# colab: +# collapsed_sections: [] +# name: Habitat Interactive Tasks +# provenance: [] +# toc_visible: true +# jupytext: +# cell_metadata_filter: -all +# formats: nb_python//py:percent,colabs//ipynb +# notebook_metadata_filter: all +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.5.2 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# language_info: +# codemirror_mode: +# name: ipython +# version: 3 +# file_extension: .py +# mimetype: text/x-python +# name: python +# nbconvert_exporter: python +# pygments_lexer: ipython3 +# version: 3.7.3 +# --- + +# %% [markdown] +# # Furniture Rearrangement - How to set up a new interaction task in Habitat-Lab +# +# This tutorial demonstrates how to set up a new task in Habitat that utilizes interaction capabilities in Habitat Simulator. +# +# ![teaser](https://drive.google.com/uc?id=1pupGvb4dGefd0T_23GpeDkkcIocDHSL_) +# +# ## Task Definition: +# The working example in this demo will be the task of **Furniture Rearrangement** - The agent will be randomly spawned in an environment in which the furniture is initially displaced from its desired position. The agent is tasked with navigating the environment, picking up the furniture and putting it in the desired position. To keep the tutorial simple and easy to follow, we will rearrange just a single object. +# +# To set up this task, we will build on top of the existing APIs in Habitat-Simulator and Habitat-Lab. Here is a summary of all the steps involved in setting up this task: +# +# 1. **Setup the Simulator**: Using existing functionalities of Habitat-Sim, we can add or remove objects from the scene. We will use these methods to spawn the agent and the objects at some pre-defined initial configuration. +# 2. 
**Create a New Dataset**: We will define a new dataset class to save / load a list of episodes for the agent to train and evaluate on. +# 3. **Grab / Release Action**: We will add the "grab/release" action to the agent's action space to allow the agent to pick up / drop an object under a crosshair. +# 4. **Extend the Simulator Class**: We will extend the Simulator Class to add support for the new actions implemented in the previous step and add other utility functions. +# 5. **Create a New Task**: Create a new task definition, implement new *sensors* and *metrics*. +# 6. **Train an RL agent**: We will define rewards for this task and use them to train an RL agent using the PPO algorithm. +# +# Let's get started! + +# %% +# @title Installation { display-mode: "form" } +# @markdown (double click to show code). + +# !curl -L https://raw.githubusercontent.com/facebookresearch/habitat-sim/master/examples/colab_utils/colab_install.sh | NIGHTLY=true bash -s +# %cd /content + +# !gdown --id 1Pc-J6pZzXEd8RSeLM94t3iwO8q_RQ853 +# !unzip -o /content/coda.zip -d /content/habitat-sim/data/scene_datasets + +# reload the cffi version +import sys + +if "google.colab" in sys.modules: + import importlib + + import cffi + + importlib.reload(cffi) + +# %% +# @title Path Setup and Imports { display-mode: "form" } +# @markdown (double click to show code). + +# %cd /content/habitat-lab + +## [setup] +import gzip +import json +import os +import sys +from typing import Any, Dict, List, Optional, Type + +import attr +import cv2 +import git +import magnum as mn +import numpy as np + +# %matplotlib inline +from matplotlib import pyplot as plt +from PIL import Image + +import habitat +import habitat_sim +from habitat.config import Config +from habitat.core.registry import registry +from habitat_sim.utils import viz_utils as vut + +if "google.colab" in sys.modules: + os.environ["IMAGEIO_FFMPEG_EXE"] = "/usr/bin/ffmpeg" + +repo = git.Repo(".", search_parent_directories=True) +dir_path = repo.working_tree_dir +# %cd $dir_path +data_path = os.path.join(dir_path, "data") +output_directory = "data/tutorials/output/" # @param {type:"string"} +output_path = os.path.join(dir_path, output_directory) + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--no-display", dest="display", action="store_false") + parser.add_argument( + "--no-make-video", dest="make_video", action="store_false" + ) + parser.set_defaults(show_video=True, make_video=True) + args, _ = parser.parse_known_args() + show_video = args.display + display = args.display + make_video = args.make_video +else: + show_video = False + make_video = False + display = False + +if make_video and not os.path.exists(output_path): + os.makedirs(output_path) + + +# %% +# @title Util functions to visualize observations +# @markdown - `make_video_cv2`: Renders a video from a list of observations +# @markdown - `simulate`: Runs simulation for a given amount of time at 60Hz +# @markdown - `simulate_and_make_vid`: Runs simulation and creates video + + +def make_video_cv2( + observations, cross_hair=None, prefix="", open_vid=True, fps=60 +): + sensor_keys = list(observations[0]) + videodims = observations[0][sensor_keys[0]].shape + videodims = (videodims[1], videodims[0]) # flip to w,h order + print(videodims) + video_file = output_path + prefix + ".mp4" + print("Encoding the video: %s " % video_file) + writer = vut.get_fast_video_writer(video_file, fps=fps) + for ob in observations: + # If in RGB/RGBA format, remove the 
alpha channel + rgb_im_1st_person = cv2.cvtColor(ob["rgb"], cv2.COLOR_RGBA2RGB) + if cross_hair is not None: + rgb_im_1st_person[ + cross_hair[0] - 2 : cross_hair[0] + 2, + cross_hair[1] - 2 : cross_hair[1] + 2, + ] = [255, 0, 0] + + if rgb_im_1st_person.shape[:2] != videodims: + rgb_im_1st_person = cv2.resize( + rgb_im_1st_person, videodims, interpolation=cv2.INTER_AREA + ) + # write the 1st person observation to video + writer.append_data(rgb_im_1st_person) + writer.close() + + if open_vid: + print("Displaying video") + vut.display_video(video_file) + + +def simulate(sim, dt=1.0, get_frames=True): + # simulate dt seconds at 60Hz to the nearest fixed timestep + print("Simulating " + str(dt) + " world seconds.") + observations = [] + start_time = sim.get_world_time() + while sim.get_world_time() < start_time + dt: + sim.step_physics(1.0 / 60.0) + if get_frames: + observations.append(sim.get_sensor_observations()) + return observations + + +# convenience wrapper for simulate and make_video_cv2 +def simulate_and_make_vid(sim, crosshair, prefix, dt=1.0, open_vid=True): + observations = simulate(sim, dt) + make_video_cv2(observations, crosshair, prefix=prefix, open_vid=open_vid) + + +def display_sample( + rgb_obs, + semantic_obs=np.array([]), + depth_obs=np.array([]), + key_points=None, # noqa: B006 +): + from habitat_sim.utils.common import d3_40_colors_rgb + + rgb_img = Image.fromarray(rgb_obs, mode="RGB") + + arr = [rgb_img] + titles = ["rgb"] + if semantic_obs.size != 0: + semantic_img = Image.new( + "P", (semantic_obs.shape[1], semantic_obs.shape[0]) + ) + semantic_img.putpalette(d3_40_colors_rgb.flatten()) + semantic_img.putdata((semantic_obs.flatten() % 40).astype(np.uint8)) + semantic_img = semantic_img.convert("RGBA") + arr.append(semantic_img) + titles.append("semantic") + + if depth_obs.size != 0: + depth_img = Image.fromarray( + (depth_obs / 10 * 255).astype(np.uint8), mode="L" + ) + arr.append(depth_img) + titles.append("depth") + + plt.figure(figsize=(12, 8)) + for i, data in enumerate(arr): + ax = plt.subplot(1, 3, i + 1) + ax.axis("off") + ax.set_title(titles[i]) + # plot points on images + if key_points is not None: + for point in key_points: + plt.plot( + point[0], point[1], marker="o", markersize=10, alpha=0.8 + ) + plt.imshow(data) + + plt.show(block=False) + + +# %% [markdown] +# ## 1. Setup the Simulator +# +# --- +# +# + +# %% +# @title Setup simulator configuration +# @markdown We'll start with setting up simulator with the following configurations +# @markdown - The simulator will render both RGB, Depth observations of 256x256 resolution. +# @markdown - The actions available will be `move_forward`, `turn_left`, `turn_right`. 
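+# A minimal usage sketch (assuming the `cfg` built just below): stepping the +# simulator by an action name returns a dict of observations keyed by the +# sensor uuids configured above, e.g.: +# +# with habitat_sim.Simulator(cfg) as sim: +#     obs = sim.step("move_forward") +#     rgb, depth = obs["rgb"], obs["depth"]  # 256x256 arrays +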
+ + +def make_cfg(settings): + sim_cfg = habitat_sim.SimulatorConfiguration() + sim_cfg.gpu_device_id = 0 + sim_cfg.default_agent_id = settings["default_agent_id"] + sim_cfg.scene_id = settings["scene"] + sim_cfg.enable_physics = settings["enable_physics"] + sim_cfg.physics_config_file = settings["physics_config_file"] + + # Note: all sensors must have the same resolution + sensors = { + "rgb": { + "sensor_type": habitat_sim.SensorType.COLOR, + "resolution": [settings["height"], settings["width"]], + "position": [0.0, settings["sensor_height"], 0.0], + }, + "depth": { + "sensor_type": habitat_sim.SensorType.DEPTH, + "resolution": [settings["height"], settings["width"]], + "position": [0.0, settings["sensor_height"], 0.0], + }, + } + + sensor_specs = [] + for sensor_uuid, sensor_params in sensors.items(): + if settings[sensor_uuid]: + sensor_spec = habitat_sim.SensorSpec() + sensor_spec.uuid = sensor_uuid + sensor_spec.sensor_type = sensor_params["sensor_type"] + sensor_spec.resolution = sensor_params["resolution"] + sensor_spec.position = sensor_params["position"] + + sensor_specs.append(sensor_spec) + + # Here you can specify the amount of displacement in a forward action and the turn angle + agent_cfg = habitat_sim.agent.AgentConfiguration() + agent_cfg.sensor_specifications = sensor_specs + agent_cfg.action_space = { + "move_forward": habitat_sim.agent.ActionSpec( + "move_forward", habitat_sim.agent.ActuationSpec(amount=0.1) + ), + "turn_left": habitat_sim.agent.ActionSpec( + "turn_left", habitat_sim.agent.ActuationSpec(amount=10.0) + ), + "turn_right": habitat_sim.agent.ActionSpec( + "turn_right", habitat_sim.agent.ActuationSpec(amount=10.0) + ), + } + + return habitat_sim.Configuration(sim_cfg, [agent_cfg]) + + +settings = { + "max_frames": 10, + "width": 256, + "height": 256, + "scene": "data/scene_datasets/coda/coda.glb", + "default_agent_id": 0, + "sensor_height": 1.5, # Height of sensors in meters + "rgb": True, # RGB sensor + "depth": True, # Depth sensor + "seed": 1, + "enable_physics": True, + "physics_config_file": "data/default.physics_config.json", + "silent": False, + "compute_shortest_path": False, + "compute_action_shortest_path": False, + "save_png": True, +} + +cfg = make_cfg(settings) + + +# %% +# @title Spawn the agent at a pre-defined location + + +def init_agent(sim): + agent_pos = np.array([-0.15776923, 0.18244143, 0.2988735]) + + # Place the agent + sim.agents[0].scene_node.translation = agent_pos + agent_orientation_y = -40 + sim.agents[0].scene_node.rotation = mn.Quaternion.rotation( + mn.Deg(agent_orientation_y), mn.Vector3(0, 1.0, 0) + ) + + +cfg.sim_cfg.default_agent_id = 0 +with habitat_sim.Simulator(cfg) as sim: + init_agent(sim) + if make_video: + # Visualize the agent's initial position + simulate_and_make_vid( + sim, None, "sim-init", dt=1.0, open_vid=show_video + ) + + +# %% +# @title Set the object's initial and final position +# @markdown Defines two utility functions: +# @markdown - `remove_all_objects`: This will remove all objects from the scene +# @markdown - `set_object_in_front_of_agent`: This will add an object in the scene in front of the agent at the specified distance. + +# @markdown Here we add a chair *3.0m* away from the agent, and the task is to place the object at the desired final position, which is *7.0m* in front of the agent. 
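+# @markdown Note on the convention used below: in Habitat the agent looks down its local -z axis, so a *negative* `z_offset` places an object in front of the agent. A quick sketch of the math: +# @markdown ```python +# @markdown agent_T = sim.agents[0].scene_node.transformation_matrix() +# @markdown chair_pos = agent_T.transform_point(np.array([0, 0, -3.0]))  # 3m ahead +# @markdown ``` +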
+ + +def remove_all_objects(sim): + for obj_id in sim.get_existing_object_ids(): + sim.remove_object(obj_id) + + +def set_object_in_front_of_agent(sim, obj_id, z_offset=-1.5): + r""" + Adds an object in front of the agent at some distance. + """ + agent_transform = sim.agents[0].scene_node.transformation_matrix() + obj_translation = agent_transform.transform_point( + np.array([0, 0, z_offset]) + ) + sim.set_translation(obj_translation, obj_id) + + obj_node = sim.get_object_scene_node(obj_id) + xform_bb = habitat_sim.geo.get_transformed_bb( + obj_node.cumulative_bb, obj_node.transformation + ) + + # also account for collision margin of the scene + scene_collision_margin = 0.04 + y_translation = mn.Vector3( + 0, xform_bb.size_y() / 2.0 + scene_collision_margin, 0 + ) + sim.set_translation(y_translation + sim.get_translation(obj_id), obj_id) + + +def init_objects(sim): + # Manager of Object Attributes Templates + obj_attr_mgr = sim.get_object_template_manager() + obj_attr_mgr.load_configs( + str(os.path.join(data_path, "test_assets/objects")) + ) + + # Add a chair into the scene. + obj_path = "test_assets/objects/chair" + chair_template_id = obj_attr_mgr.load_object_configs( + str(os.path.join(data_path, obj_path)) + )[0] + chair_attr = obj_attr_mgr.get_template_by_ID(chair_template_id) + obj_attr_mgr.register_template(chair_attr) + + # Object's initial position 3m away from the agent. + object_id = sim.add_object_by_handle(chair_attr.handle) + set_object_in_front_of_agent(sim, object_id, -3.0) + sim.set_object_motion_type( + habitat_sim.physics.MotionType.STATIC, object_id + ) + + # Object's final position 7m away from the agent + goal_id = sim.add_object_by_handle(chair_attr.handle) + set_object_in_front_of_agent(sim, goal_id, -7.0) + sim.set_object_motion_type(habitat_sim.physics.MotionType.STATIC, goal_id) + + return object_id, goal_id + + +with habitat_sim.Simulator(cfg) as sim: + init_agent(sim) + init_objects(sim) + + # Visualize the scene after the chair is added into the scene. + if make_video: + simulate_and_make_vid( + sim, None, "object-init", dt=1.0, open_vid=show_video + ) + + +# %% [markdown] +# ## Rearrangement Dataset +# ![dataset](https://drive.google.com/uc?id=1y0qS0MifmJsZ0F4jsRZGI9BrXzslFLn7) +# +# In the previous section, we created a single episode of the rearrangement task. Let's define a format to store all the necessary information about a single episode. It should store the *scene* the episode belongs to, *initial spawn position and orientation* of the agent, *object type*, object's *initial position and orientation* as well as *final position and orientation*. +# +# The format will be as follows: +# ``` +# { +# 'episode_id': 0, +# 'scene_id': 'data/scene_datasets/coda/coda.glb', +# 'goals': { +# 'position': [4.34, 0.67, -5.06], +# 'rotation': [0.0, 0.0, 0.0, 1.0] +# }, +# 'objects': { +# 'object_id': 0, +# 'object_template': 'data/test_assets/objects/chair', +# 'position': [1.77, 0.67, -1.99], +# 'rotation': [0.0, 0.0, 0.0, 1.0] +# }, +# 'start_position': [-0.15, 0.18, 0.29], +# 'start_rotation': [-0.0, -0.34, -0.0, 0.93] +# } +# ``` +# Once an episode is defined, a dataset will just be a collection of such episodes. For simplicity, in this notebook, the dataset will only contain one episode defined above. 
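+# Once the next cell has written this episode to disk, the raw record can be sanity-checked directly with the standard library (a quick sketch): +# ```python +# import gzip, json +# with gzip.open("data/datasets/rearrangement/coda/v1/train/train.json.gz", "rt") as f: +#     episodes = json.load(f)["episodes"] +# print(episodes[0]["objects"]["position"]) +# ``` 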
+# + +# %% +# @title Create a new dataset +# @markdown Utility functions to define and save the dataset for the rearrangement task + + +def get_rotation(sim, object_id): + quat = sim.get_rotation(object_id) + return np.array(quat.vector).tolist() + [quat.scalar] + + +def init_episode_dict(episode_id, scene_id, agent_pos, agent_rot): + episode_dict = { + "episode_id": episode_id, + "scene_id": "data/scene_datasets/coda/coda.glb", + "start_position": agent_pos, + "start_rotation": agent_rot, + "info": {}, + } + return episode_dict + + +def add_object_details(sim, episode_dict, obj_id, object_template, object_id): + object_template = { + "object_id": obj_id, + "object_template": object_template, + "position": np.array(sim.get_translation(object_id)).tolist(), + "rotation": get_rotation(sim, object_id), + } + episode_dict["objects"] = object_template + return episode_dict + + +def add_goal_details(sim, episode_dict, object_id): + goal_template = { + "position": np.array(sim.get_translation(object_id)).tolist(), + "rotation": get_rotation(sim, object_id), + } + episode_dict["goals"] = goal_template + return episode_dict + + +# set the number of objects to 1 always for now. +def build_episode(sim, episode_num, object_id, goal_id): + episodes = {"episodes": []} + for episode in range(episode_num): + agent_state = sim.get_agent(0).get_state() + agent_pos = np.array(agent_state.position).tolist() + agent_quat = agent_state.rotation + agent_rot = np.array(agent_quat.vec).tolist() + [agent_quat.real] + episode_dict = init_episode_dict( + episode, settings["scene"], agent_pos, agent_rot + ) + + object_attr = sim.get_object_initialization_template(object_id) + object_path = os.path.relpath( + os.path.splitext(object_attr.render_asset_handle)[0] + ) + + episode_dict = add_object_details( + sim, episode_dict, 0, object_path, object_id + ) + episode_dict = add_goal_details(sim, episode_dict, goal_id) + episodes["episodes"].append(episode_dict) + + return episodes + + +with habitat_sim.Simulator(cfg) as sim: + init_agent(sim) + object_id, goal_id = init_objects(sim) + + episodes = build_episode(sim, 1, object_id, goal_id) + + dataset_content_path = "data/datasets/rearrangement/coda/v1/train/" + if not os.path.exists(dataset_content_path): + os.makedirs(dataset_content_path) + + with gzip.open( + os.path.join(dataset_content_path, "train.json.gz"), "wt" + ) as f: + json.dump(episodes, f) + + print( + "Dataset written to {}".format( + os.path.join(dataset_content_path, "train.json.gz") + ) + ) + + +# %% +# @title Dataset class to read the saved dataset in Habitat-Lab. +# @markdown To read the saved episodes in Habitat-Lab, we will extend the `Dataset` class and the `Episode` base class. It will help provide all the relevant details about the episode through a consistent API to all downstream tasks. + +# @markdown - We will first create a `RearrangementEpisode` by extending the `NavigationEpisode` to include additional information about object's initial configuration and desired final configuration. +# @markdown - We will then define a `RearrangementDatasetV0` class that builds on top of `PointNavDatasetV1` class to read the JSON file stored earlier and initialize a list of `RearrangementEpisode`. 
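+# @markdown Registering the dataset under the name `RearrangementDataset-v0` also makes it constructible by name through the registry (a sketch, assuming habitat-lab's stock `make_dataset` helper): +# @markdown ```python +# @markdown from habitat.datasets import make_dataset +# @markdown dataset = make_dataset("RearrangementDataset-v0", config=config.DATASET) +# @markdown ``` 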
+ +from habitat.core.utils import DatasetFloatJSONEncoder, not_none_validator +from habitat.datasets.pointnav.pointnav_dataset import ( + CONTENT_SCENES_PATH_FIELD, + DEFAULT_SCENE_PATH_PREFIX, + PointNavDatasetV1, +) +from habitat.tasks.nav.nav import NavigationEpisode + + +@attr.s(auto_attribs=True, kw_only=True) +class RearrangementSpec: + r"""Specifications that capture the initial or final position + of the object. + """ + + position: List[float] = attr.ib(default=None, validator=not_none_validator) + rotation: List[float] = attr.ib(default=None, validator=not_none_validator) + info: Optional[Dict[str, str]] = attr.ib(default=None) + + +@attr.s(auto_attribs=True, kw_only=True) +class RearrangementObjectSpec(RearrangementSpec): + r"""Object specifications that capture the position of each object in the + scene and the associated object template. + """ + object_id: str = attr.ib(default=None, validator=not_none_validator) + object_template: Optional[str] = attr.ib( + default="data/test_assets/objects/chair" + ) + + +@attr.s(auto_attribs=True, kw_only=True) +class RearrangementEpisode(NavigationEpisode): + r"""Specification of episode that includes initial position and rotation + of agent, all goal specifications, all object specifications + + Args: + episode_id: id of episode in the dataset + scene_id: id of scene inside the simulator. + start_position: numpy ndarray containing 3 entries for (x, y, z). + start_rotation: numpy ndarray with 4 entries for (x, y, z, w) + elements of unit quaternion (versor) representing agent 3D + orientation. + goal: object's goal position and rotation + object: object's start specification defined with object type, + position, and rotation. + """ + objects: RearrangementObjectSpec = attr.ib( + default=None, validator=not_none_validator + ) + goals: RearrangementSpec = attr.ib( + default=None, validator=not_none_validator + ) + + +@registry.register_dataset(name="RearrangementDataset-v0") +class RearrangementDatasetV0(PointNavDatasetV1): + r"""Class inherited from PointNavDataset that loads Rearrangement dataset.""" + episodes: List[RearrangementEpisode] + content_scenes_path: str = "{data_path}/content/{scene}.json.gz" + + def to_json(self) -> str: + result = DatasetFloatJSONEncoder().encode(self) + return result + + def __init__(self, config: Optional[Config] = None) -> None: + super().__init__(config) + + def from_json( + self, json_str: str, scenes_dir: Optional[str] = None + ) -> None: + deserialized = json.loads(json_str) + if CONTENT_SCENES_PATH_FIELD in deserialized: + self.content_scenes_path = deserialized[CONTENT_SCENES_PATH_FIELD] + + for i, episode in enumerate(deserialized["episodes"]): + rearrangement_episode = RearrangementEpisode(**episode) + rearrangement_episode.episode_id = str(i) + + if scenes_dir is not None: + if rearrangement_episode.scene_id.startswith( + DEFAULT_SCENE_PATH_PREFIX + ): + rearrangement_episode.scene_id = ( + rearrangement_episode.scene_id[ + len(DEFAULT_SCENE_PATH_PREFIX) : + ] + ) + + rearrangement_episode.scene_id = os.path.join( + scenes_dir, rearrangement_episode.scene_id + ) + + rearrangement_episode.objects = RearrangementObjectSpec( + **rearrangement_episode.objects + ) + rearrangement_episode.goals = RearrangementSpec( + **rearrangement_episode.goals + ) + + self.episodes.append(rearrangement_episode) + + +# %% +# @title Load the saved dataset using the Dataset class +config = habitat.get_config("configs/datasets/pointnav/habitat_test.yaml") +config.defrost() +config.DATASET.DATA_PATH 
= ( + "data/datasets/rearrangement/coda/v1/{split}/{split}.json.gz" +) +config.DATASET.TYPE = "RearrangementDataset-v0" +config.freeze() + +dataset = RearrangementDatasetV0(config.DATASET) + +# check if the dataset got correctly deserialized +assert len(dataset.episodes) == 1 + +assert dataset.episodes[0].objects.position == [ + 1.770593523979187, + 0.6726829409599304, + -1.9992598295211792, +] +assert dataset.episodes[0].objects.rotation == [0.0, 0.0, 0.0, 1.0] +assert ( + dataset.episodes[0].objects.object_template + == "data/test_assets/objects/chair" +) + +assert dataset.episodes[0].goals.position == [ + 4.3417439460754395, + 0.6726829409599304, + -5.0634379386901855, +] +assert dataset.episodes[0].goals.rotation == [0.0, 0.0, 0.0, 1.0] + + +# %% [markdown] +# ## Implement Grab/Release Action + +# %% +# @title RayCast utility to implement Grab/Release Under Cross-Hair Action +# @markdown Cast a ray in the direction of crosshair from the camera and check if it collides with another object within a certain distance threshold + + +def raycast(sim, sensor_name, crosshair_pos=(128, 128), max_distance=2.0): + r"""Cast a ray in the direction of crosshair and check if it collides + with another object within a certain distance threshold + :param sim: Simulator object + :param sensor_name: name of the visual sensor to be used for raycasting + :param crosshair_pos: 2D coordiante in the viewport towards which the + ray will be cast + :param max_distance: distance threshold beyond which objects won't + be considered + """ + render_camera = sim._sensors[sensor_name]._sensor_object.render_camera + center_ray = render_camera.unproject(mn.Vector2i(crosshair_pos)) + + raycast_results = sim.cast_ray(center_ray, max_distance=max_distance) + + closest_object = -1 + closest_dist = 1000.0 + if raycast_results.has_hits(): + for hit in raycast_results.hits: + if hit.ray_distance < closest_dist: + closest_dist = hit.ray_distance + closest_object = hit.object_id + + return closest_object + + +# %% +# Test the raycast utility. + +with habitat_sim.Simulator(cfg) as sim: + init_agent(sim) + obj_attr_mgr = sim.get_object_template_manager() + obj_attr_mgr.load_configs( + str(os.path.join(data_path, "test_assets/objects")) + ) + obj_path = "test_assets/objects/chair" + chair_template_id = obj_attr_mgr.load_object_configs( + str(os.path.join(data_path, obj_path)) + )[0] + chair_attr = obj_attr_mgr.get_template_by_ID(chair_template_id) + obj_attr_mgr.register_template(chair_attr) + object_id = sim.add_object_by_handle(chair_attr.handle) + print(f"Chair's object id is {object_id}") + + set_object_in_front_of_agent(sim, object_id, -1.5) + sim.set_object_motion_type( + habitat_sim.physics.MotionType.STATIC, object_id + ) + if make_video: + # Visualize the agent's initial position + simulate_and_make_vid( + sim, [190, 128], "sim-before-grab", dt=1.0, open_vid=show_video + ) + + # Distance threshold=2 is greater than agent-to-chair distance. + # Should return chair's object id + closest_object = raycast( + sim, "rgb", crosshair_pos=[128, 190], max_distance=2.0 + ) + print(f"Closest Object ID: {closest_object} using 2.0 threshold") + assert ( + closest_object == object_id + ), f"Could not pick chair with ID: {object_id}" + + # Distance threshold=1 is smaller than agent-to-chair distance . 
+ # Should return -1 + closest_object = raycast( + sim, "rgb", crosshair_pos=[128, 190], max_distance=1.0 + ) + print(f"Closest Object ID: {closest_object} using 1.0 threshold") + assert closest_object == -1, "Agent should not be able to pick any object" + + +# %% +# @title Define a Grab/Release action and create a new action space. +# @markdown Each new action is defined by an `ActionSpec` and an `ActuationSpec`. `ActionSpec` is a mapping between the action name and its corresponding `ActuationSpec`. `ActuationSpec` contains all the necessary specifications required to define the action. + +from habitat.config.default import _C, CN +from habitat.core.embodied_task import SimulatorTaskAction +from habitat.sims.habitat_simulator.actions import ( + HabitatSimActions, + HabitatSimV1ActionSpaceConfiguration, +) +from habitat_sim.agent.controls.controls import ActuationSpec +from habitat_sim.physics import MotionType + + +# @markdown For instance, `GrabReleaseActuationSpec` contains the following: +# @markdown - `visual_sensor_name` defines which viewport (rgb, depth, etc) to use to cast the ray. +# @markdown - `crosshair_pos` stores the position in the viewport through which the ray passes. Any object which intersects with this ray can be grabbed by the agent. +# @markdown - `amount` defines a distance threshold. Objects which are farther than the threshold cannot be picked up by the agent. +@attr.s(auto_attribs=True, slots=True) +class GrabReleaseActuationSpec(ActuationSpec): + visual_sensor_name: str = "rgb" + crosshair_pos: List[int] = [128, 128] + amount: float = 2.0 + + +# @markdown Then, we extend the `HabitatSimV1ActionSpaceConfiguration` to add the above action into the agent's action space. `ActionSpaceConfiguration` is a mapping between the action name and the corresponding `ActionSpec`. +@registry.register_action_space_configuration(name="RearrangementActions-v0") +class RearrangementSimV0ActionSpaceConfiguration( + HabitatSimV1ActionSpaceConfiguration +): + def __init__(self, config): + super().__init__(config) + if not HabitatSimActions.has_action("GRAB_RELEASE"): + HabitatSimActions.extend_action_space("GRAB_RELEASE") + + def get(self): + config = super().get() + new_config = { + HabitatSimActions.GRAB_RELEASE: habitat_sim.ActionSpec( + "grab_or_release_object_under_crosshair", + GrabReleaseActuationSpec( + visual_sensor_name=self.config.VISUAL_SENSOR, + crosshair_pos=self.config.CROSSHAIR_POS, + amount=self.config.GRAB_DISTANCE, + ), + ) + } + + config.update(new_config) + + return config + + +# @markdown Finally, we extend `SimulatorTaskAction`, which tells the simulator which action to call when a named action ('GRAB_RELEASE' in this case) is predicted by the agent's policy. +@registry.register_task_action +class GrabOrReleaseAction(SimulatorTaskAction): + def step(self, *args: Any, **kwargs: Any): + r"""This method is called from ``Env`` on each ``step``.""" + return self._sim.step(HabitatSimActions.GRAB_RELEASE) + + +_C.TASK.ACTIONS.GRAB_RELEASE = CN() +_C.TASK.ACTIONS.GRAB_RELEASE.TYPE = "GrabOrReleaseAction" +_C.SIMULATOR.CROSSHAIR_POS = [128, 160] +_C.SIMULATOR.GRAB_DISTANCE = 2.0 +_C.SIMULATOR.VISUAL_SENSOR = "rgb" + +# %% [markdown] +# ## Setup Simulator Class for Rearrangement Task +# +# ![sim](https://drive.google.com/uc?id=1ce6Ti-gpumMEyfomqAKWqOspXm6tN4_8) + +# %% +# @title RearrangementSim Class +# @markdown Here we will extend the `HabitatSim` class for the rearrangement task. 
We will make the following changes: +# @markdown - define a new `_initialize_objects` function which will load the object in its initial configuration as defined by the episode. +# @markdown - define a `gripped_object_id` property that stores whether the agent is holding any object or not. +# @markdown - modify the `step` function of the simulator to use the `grab/release` action we defined earlier. + +# @markdown #### Writing the `step` function: +# @markdown Since we added a new action for this task, we have to modify the `step` function to define what happens when the `grab/release` action is called. If a simple navigation action (`move_forward`, `turn_left`, `turn_right`) is called, we pass it forward to the `act` function of the agent, which already defines the behavior of these actions. + +# @markdown For the `grab/release` action, if the agent is not already holding an object, we first call the `raycast` function using the values from the `ActuationSpec` to see if any object is grippable. If it returns a valid object id, we put the object in an "invisible" inventory and remove it from the scene. + +# @markdown If the agent was already holding an object, the `grab/release` action will try to release the object at the same relative position as it was grabbed. If the object can be placed without any collision, then the `release` action is successful. + +from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim +from habitat_sim.nav import NavMeshSettings +from habitat_sim.utils.common import quat_from_coeffs, quat_to_magnum + + +@registry.register_simulator(name="RearrangementSim-v0") +class RearrangementSim(HabitatSim): + r"""Simulator wrapper over habitat-sim with + object rearrangement functionalities. + """ + + def __init__(self, config: Config) -> None: + self.did_reset = False + super().__init__(config=config) + self.grip_offset = np.eye(4) + + agent_id = self.habitat_config.DEFAULT_AGENT_ID + agent_config = self._get_agent_config(agent_id) + + self.navmesh_settings = NavMeshSettings() + self.navmesh_settings.set_defaults() + self.navmesh_settings.agent_radius = agent_config.RADIUS + self.navmesh_settings.agent_height = agent_config.HEIGHT + + def reconfigure(self, config: Config) -> None: + super().reconfigure(config) + self._initialize_objects() + + def reset(self): + sim_obs = super().reset() + if self._update_agents_state(): + sim_obs = self.get_sensor_observations() + + self._prev_sim_obs = sim_obs + self.did_reset = True + self.grip_offset = np.eye(4) + return self._sensor_suite.get_observations(sim_obs) + + def _initialize_objects(self): + objects = self.habitat_config.objects[0] + obj_attr_mgr = self.get_object_template_manager() + obj_attr_mgr.load_configs( + str(os.path.join(data_path, "test_assets/objects")) + ) + # first remove all existing objects + existing_object_ids = self.get_existing_object_ids() + + if len(existing_object_ids) > 0: + for obj_id in existing_object_ids: + self.remove_object(obj_id) + + self.sim_object_to_objid_mapping = {} + self.objid_to_sim_object_mapping = {} + + if objects is not None: + object_template = objects["object_template"] + object_pos = objects["position"] + object_rot = objects["rotation"] + + object_template_id = obj_attr_mgr.load_object_configs( + object_template + )[0] + object_attr = obj_attr_mgr.get_template_by_ID(object_template_id) + obj_attr_mgr.register_template(object_attr) + + object_id = self.add_object_by_handle(object_attr.handle) + self.sim_object_to_objid_mapping[object_id] = objects["object_id"] + 
self.objid_to_sim_object_mapping[objects["object_id"]] = object_id + + self.set_translation(object_pos, object_id) + if isinstance(object_rot, list): + object_rot = quat_from_coeffs(object_rot) + + object_rot = quat_to_magnum(object_rot) + self.set_rotation(object_rot, object_id) + + self.set_object_motion_type(MotionType.STATIC, object_id) + + # Recompute the navmesh after placing all the objects. + self.recompute_navmesh(self.pathfinder, self.navmesh_settings, True) + + def _sync_gripped_object(self, gripped_object_id): + r""" + Sync the gripped object with the object associated with the agent. + """ + if gripped_object_id != -1: + agent_body_transformation = ( + self._default_agent.scene_node.transformation + ) + self.set_transformation( + agent_body_transformation, gripped_object_id + ) + translation = agent_body_transformation.transform_point( + np.array([0, 2.0, 0]) + ) + self.set_translation(translation, gripped_object_id) + + @property + def gripped_object_id(self): + return self._prev_sim_obs.get("gripped_object_id", -1) + + def step(self, action: int): + dt = 1 / 60.0 + self._num_total_frames += 1 + collided = False + gripped_object_id = self.gripped_object_id + + agent_config = self._default_agent.agent_config + action_spec = agent_config.action_space[action] + + if action_spec.name == "grab_or_release_object_under_crosshair": + # If already holding an agent + if gripped_object_id != -1: + agent_body_transformation = ( + self._default_agent.scene_node.transformation + ) + T = np.dot(agent_body_transformation, self.grip_offset) + + self.set_transformation(T, gripped_object_id) + + position = self.get_translation(gripped_object_id) + + if self.pathfinder.is_navigable(position): + self.set_object_motion_type( + MotionType.STATIC, gripped_object_id + ) + gripped_object_id = -1 + self.recompute_navmesh( + self.pathfinder, self.navmesh_settings, True + ) + # if not holding an object, then try to grab + else: + gripped_object_id = raycast( + self, + action_spec.actuation.visual_sensor_name, + crosshair_pos=action_spec.actuation.crosshair_pos, + max_distance=action_spec.actuation.amount, + ) + + # found a grabbable object. + if gripped_object_id != -1: + agent_body_transformation = ( + self._default_agent.scene_node.transformation + ) + + self.grip_offset = np.dot( + np.array(agent_body_transformation.inverted()), + np.array(self.get_transformation(gripped_object_id)), + ) + self.set_object_motion_type( + MotionType.KINEMATIC, gripped_object_id + ) + self.recompute_navmesh( + self.pathfinder, self.navmesh_settings, True + ) + + else: + collided = self._default_agent.act(action) + self._last_state = self._default_agent.get_state() + + # step physics by dt + super().step_world(dt) + + # Sync the gripped object after the agent moves. + self._sync_gripped_object(gripped_object_id) + + # obtain observations + self._prev_sim_obs = self.get_sensor_observations() + self._prev_sim_obs["collided"] = collided + self._prev_sim_obs["gripped_object_id"] = gripped_object_id + + observations = self._sensor_suite.get_observations(self._prev_sim_obs) + return observations + + +# %% [markdown] +# ## Create the Rearrangement Task +# ![task](https://drive.google.com/uc?id=1N75Mmi6aigh33uL765ljsAqLzFmcs7Zn) + +# %% +# @title Implement new sensors and measurements +# @markdown After defining the dataset, action space and simulator functions for the rearrangement task, we are one step closer to training agents to solve this task. 
+
+# @markdown Here we define inputs to the policy and other measurements required to design reward functions.
+
+# @markdown **Sensors**: These define various parts of the simulator state that are visible to the agent. For simplicity, we'll assume the agent knows the object's current position and the object's goal position, both relative to the agent's current position.
+# @markdown - The object's current position will be given by the `ObjectPosition` sensor.
+# @markdown - The object's goal position will be available through the `ObjectGoal` sensor.
+# @markdown - Finally, we will also use the `GrippedObject` sensor to tell the agent whether it is holding any object.
+
+# @markdown **Measures**: These define various metrics about the task which can be used to measure task progress and define rewards. Note that measurements are *privileged* information not accessible to the agent as part of the observation space. We will need the following measurements:
+# @markdown - `AgentToObjectDistance`, which measures the Euclidean distance between the agent and the object.
+# @markdown - `ObjectToGoalDistance`, which measures the Euclidean distance between the object and the goal.
+
+from gym import spaces
+
+import habitat_sim
+from habitat.config.default import CN, Config
+from habitat.core.dataset import Episode
+from habitat.core.embodied_task import Measure
+from habitat.core.simulator import Observations, Sensor, SensorTypes, Simulator
+from habitat.tasks.nav.nav import PointGoalSensor
+
+
+@registry.register_sensor
+class GrippedObjectSensor(Sensor):
+    cls_uuid = "gripped_object_id"
+
+    def __init__(
+        self, *args: Any, sim: RearrangementSim, config: Config, **kwargs: Any
+    ):
+        self._sim = sim
+        super().__init__(config=config)
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def _get_observation_space(self, *args: Any, **kwargs: Any):
+        return spaces.Discrete(len(self._sim.get_existing_object_ids()))
+
+    def _get_sensor_type(self, *args: Any, **kwargs: Any):
+        return SensorTypes.MEASUREMENT
+
+    def get_observation(
+        self,
+        observations: Dict[str, Observations],
+        episode: Episode,
+        *args: Any,
+        **kwargs: Any,
+    ):
+        obj_id = self._sim.sim_object_to_objid_mapping.get(
+            self._sim.gripped_object_id, -1
+        )
+        return obj_id
+
+
+@registry.register_sensor
+class ObjectPosition(PointGoalSensor):
+    cls_uuid: str = "object_position"
+
+    def _get_observation_space(self, *args: Any, **kwargs: Any):
+        sensor_shape = (self._dimensionality,)
+
+        return spaces.Box(
+            low=np.finfo(np.float32).min,
+            high=np.finfo(np.float32).max,
+            shape=sensor_shape,
+            dtype=np.float32,
+        )
+
+    def get_observation(
+        self, *args: Any, observations, episode, **kwargs: Any
+    ):
+        agent_state = self._sim.get_agent_state()
+        agent_position = agent_state.position
+        rotation_world_agent = agent_state.rotation
+
+        object_id = self._sim.get_existing_object_ids()[0]
+        object_position = self._sim.get_translation(object_id)
+        pointgoal = self._compute_pointgoal(
+            agent_position, rotation_world_agent, object_position
+        )
+        return pointgoal
+
+
+@registry.register_sensor
+class ObjectGoal(PointGoalSensor):
+    cls_uuid: str = "object_goal"
+
+    def _get_observation_space(self, *args: Any, **kwargs: Any):
+        sensor_shape = (self._dimensionality,)
+
+        return spaces.Box(
+            low=np.finfo(np.float32).min,
+            high=np.finfo(np.float32).max,
+            shape=sensor_shape,
+            dtype=np.float32,
+        )
+
+    def get_observation(
+        self, *args: Any, observations, episode, **kwargs: Any
+    ):
+        agent_state = self._sim.get_agent_state()
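+        # Note: the goal is re-expressed in the agent's frame on every step,
+        # so this observation stays egocentric as the agent moves.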
agent_position = agent_state.position + rotation_world_agent = agent_state.rotation + + goal_position = np.array(episode.goals.position, dtype=np.float32) + + point_goal = self._compute_pointgoal( + agent_position, rotation_world_agent, goal_position + ) + return point_goal + + +@registry.register_measure +class ObjectToGoalDistance(Measure): + """The measure calculates distance of object towards the goal.""" + + cls_uuid: str = "object_to_goal_distance" + + def __init__( + self, sim: Simulator, config: Config, *args: Any, **kwargs: Any + ): + self._sim = sim + self._config = config + + super().__init__(**kwargs) + + @staticmethod + def _get_uuid(*args: Any, **kwargs: Any): + return ObjectToGoalDistance.cls_uuid + + def reset_metric(self, episode, *args: Any, **kwargs: Any): + self.update_metric(*args, episode=episode, **kwargs) + + def _geo_dist(self, src_pos, goal_pos: np.array) -> float: + return self._sim.geodesic_distance(src_pos, [goal_pos]) + + def _euclidean_distance(self, position_a, position_b): + return np.linalg.norm( + np.array(position_b) - np.array(position_a), ord=2 + ) + + def update_metric(self, episode, *args: Any, **kwargs: Any): + sim_obj_id = self._sim.get_existing_object_ids()[0] + + previous_position = np.array( + self._sim.get_translation(sim_obj_id) + ).tolist() + goal_position = episode.goals.position + self._metric = self._euclidean_distance( + previous_position, goal_position + ) + + +@registry.register_measure +class AgentToObjectDistance(Measure): + """The measure calculates the distance of objects from the agent""" + + cls_uuid: str = "agent_to_object_distance" + + def __init__( + self, sim: Simulator, config: Config, *args: Any, **kwargs: Any + ): + self._sim = sim + self._config = config + + super().__init__(**kwargs) + + @staticmethod + def _get_uuid(*args: Any, **kwargs: Any): + return AgentToObjectDistance.cls_uuid + + def reset_metric(self, episode, *args: Any, **kwargs: Any): + self.update_metric(*args, episode=episode, **kwargs) + + def _euclidean_distance(self, position_a, position_b): + return np.linalg.norm( + np.array(position_b) - np.array(position_a), ord=2 + ) + + def update_metric(self, episode, *args: Any, **kwargs: Any): + sim_obj_id = self._sim.get_existing_object_ids()[0] + previous_position = np.array( + self._sim.get_translation(sim_obj_id) + ).tolist() + + agent_state = self._sim.get_agent_state() + agent_position = agent_state.position + + self._metric = self._euclidean_distance( + previous_position, agent_position + ) + + +# ----------------------------------------------------------------------------- +# # REARRANGEMENT TASK GRIPPED OBJECT SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.GRIPPED_OBJECT_SENSOR = CN() +_C.TASK.GRIPPED_OBJECT_SENSOR.TYPE = "GrippedObjectSensor" +# ----------------------------------------------------------------------------- +# # REARRANGEMENT TASK ALL OBJECT POSITIONS SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.OBJECT_POSITION = CN() +_C.TASK.OBJECT_POSITION.TYPE = "ObjectPosition" +_C.TASK.OBJECT_POSITION.GOAL_FORMAT = "POLAR" +_C.TASK.OBJECT_POSITION.DIMENSIONALITY = 2 +# ----------------------------------------------------------------------------- +# # REARRANGEMENT TASK ALL OBJECT GOALS SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.OBJECT_GOAL = CN() +_C.TASK.OBJECT_GOAL.TYPE = "ObjectGoal" +_C.TASK.OBJECT_GOAL.GOAL_FORMAT = "POLAR" 
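+# With GOAL_FORMAT = "POLAR", a DIMENSIONALITY of 2 exposes the goal as a
+# (distance, angle) vector in the agent's local frame, mirroring the
+# POINTGOAL_SENSOR convention.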
+_C.TASK.OBJECT_GOAL.DIMENSIONALITY = 2 +# ----------------------------------------------------------------------------- +# # OBJECT_DISTANCE_TO_GOAL MEASUREMENT +# ----------------------------------------------------------------------------- +_C.TASK.OBJECT_TO_GOAL_DISTANCE = CN() +_C.TASK.OBJECT_TO_GOAL_DISTANCE.TYPE = "ObjectToGoalDistance" +# ----------------------------------------------------------------------------- +# # OBJECT_DISTANCE_FROM_AGENT MEASUREMENT +# ----------------------------------------------------------------------------- +_C.TASK.AGENT_TO_OBJECT_DISTANCE = CN() +_C.TASK.AGENT_TO_OBJECT_DISTANCE.TYPE = "AgentToObjectDistance" + +from habitat.config.default import CN, Config + +# %% +# @title Define `RearrangementTask` by extending `NavigationTask` +from habitat.tasks.nav.nav import NavigationTask, merge_sim_episode_config + + +def merge_sim_episode_with_object_config( + sim_config: Config, episode: Type[Episode] +) -> Any: + sim_config = merge_sim_episode_config(sim_config, episode) + sim_config.defrost() + sim_config.objects = [episode.objects.__dict__] + sim_config.freeze() + + return sim_config + + +@registry.register_task(name="RearrangementTask-v0") +class RearrangementTask(NavigationTask): + r"""Embodied Rearrangement Task + Goal: An agent must place objects at their corresponding goal position. + """ + + def __init__(self, **kwargs) -> None: + super().__init__(**kwargs) + + def overwrite_sim_config(self, sim_config, episode): + return merge_sim_episode_with_object_config(sim_config, episode) + + +# %% [markdown] +# ## Implement a hard-coded and an RL agent +# +# + +# %% +# @title Load the `RearrangementTask` in Habitat-Lab and run a hard-coded agent +import habitat + +config = habitat.get_config("configs/tasks/pointnav.yaml") +config.defrost() +config.ENVIRONMENT.MAX_EPISODE_STEPS = 50 +config.SIMULATOR.TYPE = "RearrangementSim-v0" +config.SIMULATOR.ACTION_SPACE_CONFIG = "RearrangementActions-v0" +config.SIMULATOR.GRAB_DISTANCE = 2.0 +config.SIMULATOR.HABITAT_SIM_V0.ENABLE_PHYSICS = True +config.TASK.TYPE = "RearrangementTask-v0" +config.TASK.SUCCESS_DISTANCE = 1.0 +config.TASK.SENSORS = [ + "GRIPPED_OBJECT_SENSOR", + "OBJECT_POSITION", + "OBJECT_GOAL", +] +config.TASK.GOAL_SENSOR_UUID = "object_goal" +config.TASK.MEASUREMENTS = [ + "OBJECT_TO_GOAL_DISTANCE", + "AGENT_TO_OBJECT_DISTANCE", +] +config.TASK.POSSIBLE_ACTIONS = ["STOP", "MOVE_FORWARD", "GRAB_RELEASE"] +config.DATASET.TYPE = "RearrangementDataset-v0" +config.DATASET.SPLIT = "train" +config.DATASET.DATA_PATH = ( + "data/datasets/rearrangement/coda/v1/{split}/{split}.json.gz" +) +config.freeze() + + +def print_info(obs, metrics): + print( + "Gripped Object: {}, Distance To Object: {}, Distance To Goal: {}".format( + obs["gripped_object_id"], + metrics["agent_to_object_distance"], + metrics["object_to_goal_distance"], + ) + ) + + +try: # Got to make initialization idiot proof + sim.close() +except NameError: + pass + +with habitat.Env(config) as env: + obs = env.reset() + obs_list = [] + # Get closer to the object + while True: + obs = env.step(1) + obs_list.append(obs) + metrics = env.get_metrics() + print_info(obs, metrics) + if metrics["agent_to_object_distance"] < 2.0: + break + + # Grab the object + obs = env.step(2) + obs_list.append(obs) + metrics = env.get_metrics() + print_info(obs, metrics) + assert obs["gripped_object_id"] != -1 + + # Get closer to the goal + while True: + obs = env.step(1) + obs_list.append(obs) + metrics = env.get_metrics() + print_info(obs, metrics) + if 
metrics["object_to_goal_distance"] < 2.0:
+            break
+
+    # Release the object
+    obs = env.step(2)
+    obs_list.append(obs)
+    metrics = env.get_metrics()
+    print_info(obs, metrics)
+    assert obs["gripped_object_id"] == -1
+
+    if make_video:
+        make_video_cv2(
+            obs_list,
+            [190, 128],
+            "hard-coded-agent",
+            fps=5.0,
+            open_vid=show_video,
+        )
+
+# %%
+# @title Create a task-specific RL Environment with a new reward definition.
+# @markdown We create a `RearrangementRLEnv` class and modify the `get_reward()` function.
+# @markdown The reward structure is as follows:
+# @markdown - The agent gets a positive reward if it moves closer to the object, otherwise a negative reward.
+# @markdown - The agent gets a positive reward if it moves the object closer to the goal, otherwise a negative reward.
+# @markdown - The agent gets a positive reward when it "picks up" an object for the first time. For all other "grab/release" actions, it gets a negative reward.
+# @markdown - The agent gets a slack penalty of -0.01 for every action it takes in the environment.
+# @markdown - Finally, the agent gets a large success reward when the episode is completed successfully.
+
+from typing import Optional, Type
+
+import numpy as np
+
+import habitat
+from habitat import Config, Dataset
+from habitat_baselines.common.baseline_registry import baseline_registry
+from habitat_baselines.common.environments import NavRLEnv
+
+
+@baseline_registry.register_env(name="RearrangementRLEnv")
+class RearrangementRLEnv(NavRLEnv):
+    def __init__(self, config: Config, dataset: Optional[Dataset] = None):
+        self._prev_measure = {
+            "agent_to_object_distance": 0.0,
+            "object_to_goal_distance": 0.0,
+            "gripped_object_id": -1,
+            "gripped_object_count": 0,
+        }
+
+        super().__init__(config, dataset)
+
+        self._success_distance = self._core_env_config.TASK.SUCCESS_DISTANCE
+
+    def reset(self):
+        self._previous_action = None
+        observations = super().reset()
+
+        self._prev_measure.update(self.habitat_env.get_metrics())
+        self._prev_measure["gripped_object_id"] = -1
+        self._prev_measure["gripped_object_count"] = 0
+
+        return observations
+
+    def step(self, *args, **kwargs):
+        self._previous_action = kwargs["action"]
+        return super().step(*args, **kwargs)
+
+    def get_reward_range(self):
+        return (
+            self._rl_config.SLACK_REWARD - 1.0,
+            self._rl_config.SUCCESS_REWARD + 1.0,
+        )
+
+    def get_reward(self, observations):
+        reward = self._rl_config.SLACK_REWARD
+        gripped_success_reward = 0.0
+        episode_success_reward = 0.0
+        agent_to_object_dist_reward = 0.0
+        object_to_goal_dist_reward = 0.0
+
+        action_name = self._env.task.get_action_name(
+            self._previous_action["action"]
+        )
+
+        # If an object was grabbed, add a success reward.
+        # The reward gets awarded only once for an object.
+        if (
+            action_name == "GRAB_RELEASE"
+            and observations["gripped_object_id"] >= 0
+        ):
+            obj_id = observations["gripped_object_id"]
+            self._prev_measure["gripped_object_count"] += 1
+
+            gripped_success_reward = (
+                self._rl_config.GRIPPED_SUCCESS_REWARD
+                if self._prev_measure["gripped_object_count"] == 1
+                else 0.0
+            )
+        # add a penalty every time grab/release is called and does nothing
+        elif action_name == "GRAB_RELEASE":
+            gripped_success_reward += -0.1
+
+        self._prev_measure["gripped_object_id"] = observations[
+            "gripped_object_id"
+        ]
+
+        # If the action is not a grab/release action, and the agent
+        # has not picked up an object, then give reward based on agent to
+        # object distance.
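+        # Both distance terms below are shaping rewards: each returns the
+        # decrease in distance since the previous step, so progress is
+        # rewarded and moving away is penalized.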
+ if ( + action_name != "GRAB_RELEASE" + and self._prev_measure["gripped_object_id"] == -1 + ): + agent_to_object_dist_reward = self.get_agent_to_object_dist_reward( + observations + ) + + # If the action is not a grab/release action, and the agent + # has picked up an object, then give reward based on object to + # to goal distance. + if ( + action_name != "GRAB_RELEASE" + and self._prev_measure["gripped_object_id"] != -1 + ): + object_to_goal_dist_reward = self.get_object_to_goal_dist_reward() + + if ( + self._episode_success(observations) + and self._prev_measure["gripped_object_id"] == -1 + and action_name == "STOP" + ): + episode_success_reward = self._rl_config.SUCCESS_REWARD + + reward += ( + agent_to_object_dist_reward + + object_to_goal_dist_reward + + gripped_success_reward + + episode_success_reward + ) + + return reward + + def get_agent_to_object_dist_reward(self, observations): + """ + Encourage the agent to move towards the closest object which is not already in place. + """ + curr_metric = self._env.get_metrics()["agent_to_object_distance"] + prev_metric = self._prev_measure["agent_to_object_distance"] + dist_reward = prev_metric - curr_metric + + self._prev_measure["agent_to_object_distance"] = curr_metric + + return dist_reward + + def get_object_to_goal_dist_reward(self): + curr_metric = self._env.get_metrics()["object_to_goal_distance"] + prev_metric = self._prev_measure["object_to_goal_distance"] + dist_reward = prev_metric - curr_metric + + self._prev_measure["object_to_goal_distance"] = curr_metric + + return dist_reward + + def _episode_success(self, observations): + r"""Returns True if object is within distance threshold of the goal.""" + dist = self._env.get_metrics()["object_to_goal_distance"] + if ( + abs(dist) > self._success_distance + or observations["gripped_object_id"] != -1 + ): + return False + return True + + def _gripped_success(self, observations): + if ( + observations["gripped_object_id"] >= 0 + and observations["gripped_object_id"] + != self._prev_measure["gripped_object_id"] + ): + return True + + return False + + def get_done(self, observations): + done = False + action_name = self._env.task.get_action_name( + self._previous_action["action"] + ) + if self._env.episode_over or ( + self._episode_success(observations) + and self._prev_measure["gripped_object_id"] == -1 + and action_name == "STOP" + ): + done = True + return done + + def get_info(self, observations): + info = self.habitat_env.get_metrics() + info["episode_success"] = self._episode_success(observations) + return info + + +# %% +import os +import time +from typing import Any, Dict, List, Optional + +import numpy as np +from torch.optim.lr_scheduler import LambdaLR + +from habitat import Config, logger +from habitat.utils.visualizations.utils import observations_to_image +from habitat_baselines.common.baseline_registry import baseline_registry +from habitat_baselines.common.environments import get_env_class +from habitat_baselines.common.tensorboard_utils import TensorboardWriter +from habitat_baselines.rl.models.rnn_state_encoder import ( + build_rnn_state_encoder, +) +from habitat_baselines.rl.ppo import PPO +from habitat_baselines.rl.ppo.policy import Net, Policy +from habitat_baselines.rl.ppo.ppo_trainer import PPOTrainer +from habitat_baselines.utils.common import batch_obs, generate_video +from habitat_baselines.utils.env_utils import make_env_fn + + +def construct_envs( + config, + env_class, + workers_ignore_signals=False, +): + r"""Create VectorEnv object with specified config 
and env class type.
+    To allow better performance, the dataset is split into smaller ones for
+    each individual env, grouped by scenes.
+
+    :param config: configs that contain num_processes as well as information
+        necessary to create individual environments.
+    :param env_class: class type of the envs to be created.
+    :param workers_ignore_signals: Passed to :ref:`habitat.VectorEnv`'s constructor
+
+    :return: VectorEnv object created according to specification.
+    """
+
+    num_processes = config.NUM_ENVIRONMENTS
+    configs = []
+    env_classes = [env_class for _ in range(num_processes)]
+    dataset = habitat.datasets.make_dataset(config.TASK_CONFIG.DATASET.TYPE)
+    scenes = config.TASK_CONFIG.DATASET.CONTENT_SCENES
+    if "*" in config.TASK_CONFIG.DATASET.CONTENT_SCENES:
+        scenes = dataset.get_scenes_to_load(config.TASK_CONFIG.DATASET)
+
+    if num_processes > 1:
+        if len(scenes) == 0:
+            raise RuntimeError(
+                "No scenes to load, multiple process logic relies on being able to split scenes uniquely between processes"
+            )
+
+        if len(scenes) < num_processes:
+            scenes = scenes * num_processes
+
+        random.shuffle(scenes)
+
+    scene_splits = [[] for _ in range(num_processes)]
+    for idx, scene in enumerate(scenes):
+        scene_splits[idx % len(scene_splits)].append(scene)
+
+    assert sum(map(len, scene_splits)) == len(scenes)
+
+    for i in range(num_processes):
+        proc_config = config.clone()
+        proc_config.defrost()
+
+        task_config = proc_config.TASK_CONFIG
+        task_config.SEED = task_config.SEED + i
+        if len(scenes) > 0:
+            task_config.DATASET.CONTENT_SCENES = scene_splits[i]
+
+        task_config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = (
+            config.SIMULATOR_GPU_ID
+        )
+
+        task_config.SIMULATOR.AGENT_0.SENSORS = config.SENSORS
+
+        proc_config.freeze()
+        configs.append(proc_config)
+
+    envs = habitat.ThreadedVectorEnv(
+        make_env_fn=make_env_fn,
+        env_fn_args=tuple(zip(configs, env_classes)),
+        workers_ignore_signals=workers_ignore_signals,
+    )
+    return envs
+
+
+class RearrangementBaselinePolicy(Policy):
+    def __init__(self, observation_space, action_space, hidden_size=512):
+        super().__init__(
+            RearrangementBaselineNet(
+                observation_space=observation_space, hidden_size=hidden_size
+            ),
+            action_space.n,
+        )
+
+    @classmethod
+    def from_config(cls, config, envs):
+        pass
+
+
+class RearrangementBaselineNet(Net):
+    r"""Network which concatenates the object goal and object position
+    vectors and passes them through an RNN state encoder.
+ """ + + def __init__(self, observation_space, hidden_size): + super().__init__() + + self._n_input_goal = observation_space.spaces[ + ObjectGoal.cls_uuid + ].shape[0] + + self._hidden_size = hidden_size + + self.state_encoder = build_rnn_state_encoder( + 2 * self._n_input_goal, self._hidden_size + ) + + self.train() + + @property + def output_size(self): + return self._hidden_size + + @property + def is_blind(self): + return False + + @property + def num_recurrent_layers(self): + return self.state_encoder.num_recurrent_layers + + def forward(self, observations, rnn_hidden_states, prev_actions, masks): + object_goal_encoding = observations[ObjectGoal.cls_uuid] + object_pos_encoding = observations[ObjectPosition.cls_uuid] + + x = [object_goal_encoding, object_pos_encoding] + + x = torch.cat(x, dim=1) + x, rnn_hidden_states = self.state_encoder(x, rnn_hidden_states, masks) + + return x, rnn_hidden_states + + +@baseline_registry.register_trainer(name="ppo-rearrangement") +class RearrangementTrainer(PPOTrainer): + supported_tasks = ["RearrangementTask-v0"] + + def _setup_actor_critic_agent(self, ppo_cfg: Config) -> None: + r"""Sets up actor critic and agent for PPO. + + Args: + ppo_cfg: config node with relevant params + + Returns: + None + """ + logger.add_filehandler(self.config.LOG_FILE) + + self.actor_critic = RearrangementBaselinePolicy( + observation_space=self.envs.observation_spaces[0], + action_space=self.envs.action_spaces[0], + hidden_size=ppo_cfg.hidden_size, + ) + self.actor_critic.to(self.device) + + self.agent = PPO( + actor_critic=self.actor_critic, + clip_param=ppo_cfg.clip_param, + ppo_epoch=ppo_cfg.ppo_epoch, + num_mini_batch=ppo_cfg.num_mini_batch, + value_loss_coef=ppo_cfg.value_loss_coef, + entropy_coef=ppo_cfg.entropy_coef, + lr=ppo_cfg.lr, + eps=ppo_cfg.eps, + max_grad_norm=ppo_cfg.max_grad_norm, + use_normalized_advantage=ppo_cfg.use_normalized_advantage, + ) + + def _init_envs(self, config=None): + if config is None: + config = self.config + + self.envs = construct_envs(config, get_env_class(config.ENV_NAME)) + + def train(self) -> None: + r"""Main method for training PPO. 
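+
+        Collects rollouts with the current policy, runs PPO updates, logs
+        scalars to TensorBoard, and saves checkpoints on a fixed schedule.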
+ + Returns: + None + """ + if self._is_distributed: + raise RuntimeError("This trainer does not support distributed") + self._init_train() + + count_checkpoints = 0 + + lr_scheduler = LambdaLR( + optimizer=self.agent.optimizer, + lr_lambda=lambda _: 1 - self.percent_done(), + ) + ppo_cfg = self.config.RL.PPO + + with TensorboardWriter( + self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs + ) as writer: + while not self.is_done(): + + if ppo_cfg.use_linear_clip_decay: + self.agent.clip_param = ppo_cfg.clip_param * ( + 1 - self.percent_done() + ) + + count_steps_delta = 0 + for _step in range(ppo_cfg.num_steps): + count_steps_delta += self._collect_rollout_step() + + ( + value_loss, + action_loss, + dist_entropy, + ) = self._update_agent() + + if ppo_cfg.use_linear_lr_decay: + lr_scheduler.step() # type: ignore + + losses = self._coalesce_post_step( + dict(value_loss=value_loss, action_loss=action_loss), + count_steps_delta, + ) + self.num_updates_done += 1 + + deltas = { + k: ( + (v[-1] - v[0]).sum().item() + if len(v) > 1 + else v[0].sum().item() + ) + for k, v in self.window_episode_stats.items() + } + deltas["count"] = max(deltas["count"], 1.0) + + writer.add_scalar( + "reward", + deltas["reward"] / deltas["count"], + self.num_steps_done, + ) + + # Check to see if there are any metrics + # that haven't been logged yet + + for k, v in deltas.items(): + if k not in {"reward", "count"}: + writer.add_scalar( + "metric/" + k, + v / deltas["count"], + self.num_steps_done, + ) + + losses = [value_loss, action_loss] + for l, k in zip(losses, ["value, policy"]): + writer.add_scalar("losses/" + k, l, self.num_steps_done) + + # log stats + if self.num_updates_done % self.config.LOG_INTERVAL == 0: + logger.info( + "update: {}\tfps: {:.3f}\t".format( + self.num_updates_done, + self.num_steps_done / (time.time() - self.t_start), + ) + ) + + logger.info( + "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t" + "frames: {}".format( + self.num_updates_done, + self.env_time, + self.pth_time, + self.num_steps_done, + ) + ) + + logger.info( + "Average window size: {} {}".format( + len(self.window_episode_stats["count"]), + " ".join( + "{}: {:.3f}".format(k, v / deltas["count"]) + for k, v in deltas.items() + if k != "count" + ), + ) + ) + + # checkpoint model + if self.should_checkpoint(): + self.save_checkpoint( + f"ckpt.{count_checkpoints}.pth", + dict(step=self.num_steps_done), + ) + count_checkpoints += 1 + + self.envs.close() + + def eval(self) -> None: + r"""Evaluates the current model + Returns: + None + """ + + config = self.config.clone() + + if len(self.config.VIDEO_OPTION) > 0: + config.defrost() + config.NUM_ENVIRONMENTS = 1 + config.freeze() + + logger.info(f"env config: {config}") + with construct_envs(config, get_env_class(config.ENV_NAME)) as envs: + observations = envs.reset() + batch = batch_obs(observations, device=self.device) + + current_episode_reward = torch.zeros( + envs.num_envs, 1, device=self.device + ) + ppo_cfg = self.config.RL.PPO + test_recurrent_hidden_states = torch.zeros( + config.NUM_ENVIRONMENTS, + self.actor_critic.net.num_recurrent_layers, + ppo_cfg.hidden_size, + device=self.device, + ) + prev_actions = torch.zeros( + config.NUM_ENVIRONMENTS, + 1, + device=self.device, + dtype=torch.long, + ) + not_done_masks = torch.zeros( + config.NUM_ENVIRONMENTS, + 1, + device=self.device, + dtype=torch.bool, + ) + + rgb_frames = [ + [] for _ in range(self.config.NUM_ENVIRONMENTS) + ] # type: List[List[np.ndarray]] + + if len(config.VIDEO_OPTION) > 0: + 
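+                # Make sure the directory for rendered evaluation videos
+                # exists before any frames are written to disk.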
os.makedirs(config.VIDEO_DIR, exist_ok=True) + + self.actor_critic.eval() + + for _i in range(config.TASK_CONFIG.ENVIRONMENT.MAX_EPISODE_STEPS): + current_episodes = envs.current_episodes() + + with torch.no_grad(): + ( + _, + actions, + _, + test_recurrent_hidden_states, + ) = self.actor_critic.act( + batch, + test_recurrent_hidden_states, + prev_actions, + not_done_masks, + deterministic=False, + ) + + prev_actions.copy_(actions) + + outputs = envs.step([a[0].item() for a in actions]) + + observations, rewards, dones, infos = [ + list(x) for x in zip(*outputs) + ] + batch = batch_obs(observations, device=self.device) + + not_done_masks = torch.tensor( + [[not done] for done in dones], + dtype=torch.bool, + device="cpu", + ) + + rewards = torch.tensor( + rewards, dtype=torch.float, device=self.device + ).unsqueeze(1) + + current_episode_reward += rewards + + # episode ended + if not not_done_masks[0].item(): + generate_video( + video_option=self.config.VIDEO_OPTION, + video_dir=self.config.VIDEO_DIR, + images=rgb_frames[0], + episode_id=current_episodes[0].episode_id, + checkpoint_idx=0, + metrics=self._extract_scalars_from_info(infos[0]), + tb_writer=None, + ) + + print("Evaluation Finished.") + print("Success: {}".format(infos[0]["episode_success"])) + print( + "Reward: {}".format(current_episode_reward[0].item()) + ) + print( + "Distance To Goal: {}".format( + infos[0]["object_to_goal_distance"] + ) + ) + + return + + # episode continues + elif len(self.config.VIDEO_OPTION) > 0: + frame = observations_to_image(observations[0], infos[0]) + rgb_frames[0].append(frame) + + not_done_masks = not_done_masks.to(device=self.device) + + +# %% +# %load_ext tensorboard +# %tensorboard --logdir data/tb + +# %% +# @title Train an RL agent on a single episode +# !if [ -d "data/tb" ]; then rm -r data/tb; fi + +import random + +import numpy as np +import torch + +import habitat +from habitat import Config +from habitat_baselines.config.default import get_config as get_baseline_config + +baseline_config = get_baseline_config( + "habitat_baselines/config/pointnav/ppo_pointnav.yaml" +) +baseline_config.defrost() + +baseline_config.TASK_CONFIG = config +baseline_config.TRAINER_NAME = "ddppo" +baseline_config.ENV_NAME = "RearrangementRLEnv" +baseline_config.SIMULATOR_GPU_ID = 0 +baseline_config.TORCH_GPU_ID = 0 +baseline_config.VIDEO_OPTION = ["disk"] +baseline_config.TENSORBOARD_DIR = "data/tb" +baseline_config.VIDEO_DIR = "data/videos" +baseline_config.NUM_ENVIRONMENTS = 2 +baseline_config.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR"] +baseline_config.CHECKPOINT_FOLDER = "data/checkpoints" +baseline_config.TOTAL_NUM_STEPS = -1.0 + +if vut.is_notebook(): + baseline_config.NUM_UPDATES = 400 # @param {type:"number"} +else: + baseline_config.NUM_UPDATES = 1 + +baseline_config.LOG_INTERVAL = 10 +baseline_config.NUM_CHECKPOINTS = 5 +baseline_config.LOG_FILE = "data/checkpoints/train.log" +baseline_config.EVAL.SPLIT = "train" +baseline_config.RL.SUCCESS_REWARD = 2.5 # @param {type:"number"} +baseline_config.RL.SUCCESS_MEASURE = "object_to_goal_distance" +baseline_config.RL.REWARD_MEASURE = "object_to_goal_distance" +baseline_config.RL.GRIPPED_SUCCESS_REWARD = 2.5 # @param {type:"number"} + +baseline_config.freeze() +random.seed(baseline_config.TASK_CONFIG.SEED) +np.random.seed(baseline_config.TASK_CONFIG.SEED) +torch.manual_seed(baseline_config.TASK_CONFIG.SEED) + +if __name__ == "__main__": + trainer = RearrangementTrainer(baseline_config) + trainer.train() + trainer.eval() + + if make_video: + video_file = 
os.listdir("data/videos")[0] + vut.display_video(os.path.join("data/videos", video_file)) diff --git a/habitat-lab-dialog/examples/tutorials/nb_python/Habitat_Lab.py b/habitat-lab-dialog/examples/tutorials/nb_python/Habitat_Lab.py new file mode 100644 index 0000000..869c4da --- /dev/null +++ b/habitat-lab-dialog/examples/tutorials/nb_python/Habitat_Lab.py @@ -0,0 +1,431 @@ +# --- +# jupyter: +# accelerator: GPU +# colab: +# collapsed_sections: [] +# name: Habitat Lab +# provenance: [] +# jupytext: +# cell_metadata_filter: -all +# formats: nb_python//py:percent,colabs//ipynb +# notebook_metadata_filter: all +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.5.2 +# kernelspec: +# display_name: Python 3 +# name: python3 +# --- + +# %% +# @title Installation + +# !curl -L https://raw.githubusercontent.com/facebookresearch/habitat-sim/master/examples/colab_utils/colab_install.sh | NIGHTLY=true bash -s +# !wget -c http://dl.fbaipublicfiles.com/habitat/mp3d_example.zip && unzip -o mp3d_example.zip -d /content/habitat-sim/data/scene_datasets/mp3d/ + +# %% +# !pip uninstall --yes pyopenssl +# !pip install pyopenssl + +# %% +# @title Colab Setup and Imports { display-mode: "form" } +# @markdown (double click to see the code) + +import os +import random +import sys + +import git +import numpy as np +from gym import spaces + +# %matplotlib inline +from matplotlib import pyplot as plt + +# %cd "/content/habitat-lab" + + +if "google.colab" in sys.modules: + # This tells imageio to use the system FFMPEG that has hardware acceleration. + os.environ["IMAGEIO_FFMPEG_EXE"] = "/usr/bin/ffmpeg" +repo = git.Repo(".", search_parent_directories=True) +dir_path = repo.working_tree_dir +# %cd $dir_path + +from PIL import Image + +import habitat +from habitat.core.logging import logger +from habitat.core.registry import registry +from habitat.sims.habitat_simulator.actions import HabitatSimActions +from habitat.tasks.nav.nav import NavigationTask +from habitat_baselines.common.baseline_registry import baseline_registry +from habitat_baselines.config.default import get_config as get_baselines_config + +# %% +# @title Define Observation Display Utility Function { display-mode: "form" } + +# @markdown A convenient function that displays sensor observations with matplotlib. 
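+# @markdown Typical usage below is `display_sample(obs["rgb"])`; the semantic
+# @markdown and depth panels are optional and drawn only when provided.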
+ +# @markdown (double click to see the code) + + +# Change to do something like this maybe: https://stackoverflow.com/a/41432704 +def display_sample( + rgb_obs, semantic_obs=np.array([]), depth_obs=np.array([]) +): # noqa B006 + from habitat_sim.utils.common import d3_40_colors_rgb + + rgb_img = Image.fromarray(rgb_obs, mode="RGB") + + arr = [rgb_img] + titles = ["rgb"] + if semantic_obs.size != 0: + semantic_img = Image.new( + "P", (semantic_obs.shape[1], semantic_obs.shape[0]) + ) + semantic_img.putpalette(d3_40_colors_rgb.flatten()) + semantic_img.putdata((semantic_obs.flatten() % 40).astype(np.uint8)) + semantic_img = semantic_img.convert("RGBA") + arr.append(semantic_img) + titles.append("semantic") + + if depth_obs.size != 0: + depth_img = Image.fromarray( + (depth_obs / 10 * 255).astype(np.uint8), mode="L" + ) + arr.append(depth_img) + titles.append("depth") + + plt.figure(figsize=(12, 8)) + for i, data in enumerate(arr): + ax = plt.subplot(1, 3, i + 1) + ax.axis("off") + ax.set_title(titles[i]) + plt.imshow(data) + plt.show(block=False) + + +# %% [markdown] +# ## Setup PointNav Task + +# %% +# cat "./configs/test/habitat_all_sensors_test.yaml" + +# %% +if __name__ == "__main__": + config = habitat.get_config( + config_paths="./configs/test/habitat_all_sensors_test.yaml" + ) + + try: + env.close() + except NameError: + pass + env = habitat.Env(config=config) + +# %% + action = None + obs = env.reset() + valid_actions = ["TURN_LEFT", "TURN_RIGHT", "MOVE_FORWARD", "STOP"] + interactive_control = False # @param {type:"boolean"} + while action != "STOP": + display_sample(obs["rgb"]) + print( + "distance to goal: {:.2f}".format( + obs["pointgoal_with_gps_compass"][0] + ) + ) + print( + "angle to goal (radians): {:.2f}".format( + obs["pointgoal_with_gps_compass"][1] + ) + ) + if interactive_control: + action = input( + "enter action out of {}:\n".format(", ".join(valid_actions)) + ) + assert ( + action in valid_actions + ), "invalid action {} entered, choose one amongst " + ",".join( + valid_actions + ) + else: + action = valid_actions.pop() + obs = env.step( + { + "action": action, + } + ) + + env.close() + +# %% + print(env.get_metrics()) + +# %% [markdown] +# ## RL Training + +# %% +if __name__ == "__main__": + config = get_baselines_config( + "./habitat_baselines/config/pointnav/ppo_pointnav_example.yaml" + ) + +# %% +# set random seeds +if __name__ == "__main__": + seed = "42" # @param {type:"string"} + steps_in_thousands = "10" # @param {type:"string"} + + config.defrost() + config.TASK_CONFIG.SEED = int(seed) + config.TOTAL_NUM_STEPS = int(steps_in_thousands) + config.LOG_INTERVAL = 1 + config.freeze() + + random.seed(config.TASK_CONFIG.SEED) + np.random.seed(config.TASK_CONFIG.SEED) + +# %% +if __name__ == "__main__": + trainer_init = baseline_registry.get_trainer(config.TRAINER_NAME) + trainer = trainer_init(config) + trainer.train() + +# %% +# @markdown (double click to see the code) + +# example tensorboard visualization +# for more details refer to [link](https://github.com/facebookresearch/habitat-lab/tree/master/habitat_baselines#additional-utilities). + +try: + from IPython import display + + with open("./res/img/tensorboard_video_demo.gif", "rb") as f: + display.display(display.Image(data=f.read(), format="png")) +except ImportError: + pass +# %% [markdown] +# ## Key Concepts +# +# All the concepts link to their definitions: +# +# 1. 
[`habitat.sims.habitat_simulator.HabitatSim`](https://github.com/facebookresearch/habitat-lab/blob/master/habitat/sims/habitat_simulator/habitat_simulator.py#L159)
+# Thin wrapper over `habitat_sim` providing seamless integration with the experimentation framework.
+#
+#
+# 2. [`habitat.core.env.Env`](https://github.com/facebookresearch/habitat-lab/blob/master/habitat/core/env.py)
+# Abstraction for the universe of agent, task and simulator. Agents that you train and evaluate operate inside the environment.
+#
+#
+# 3. [`habitat.core.env.RLEnv`](https://github.com/facebookresearch/habitat-lab/blob/71d409ab214a7814a9bd9b7e44fd25f57a0443ba/habitat/core/env.py#L278)
+# Extends the `Env` class for reinforcement learning by defining the reward and other required components.
+#
+#
+# 4. [`habitat.core.embodied_task.EmbodiedTask`](https://github.com/facebookresearch/habitat-lab/blob/71d409ab214a7814a9bd9b7e44fd25f57a0443ba/habitat/core/embodied_task.py#L242)
+# Defines the task that the agent needs to solve. This class holds the definitions of the observation space, action space, measures, and simulator usage, e.g. PointNav, ObjectNav.
+#
+#
+# 5. [`habitat.core.dataset.Dataset`](https://github.com/facebookresearch/habitat-lab/blob/4b6da1c4f8eb287cea43e70c50fe1d615a261198/habitat/core/dataset.py#L63)
+# Wrapper over the information required for an embodied task's dataset; contains the definition of, and interaction with, an `episode`.
+#
+#
+# 6. [`habitat.core.embodied_task.Measure`](https://github.com/facebookresearch/habitat-lab/blob/master/habitat/core/embodied_task.py#L82)
+# Defines the metrics for an embodied task, e.g. [SPL](https://github.com/facebookresearch/habitat-lab/blob/d0db1b55be57abbacc5563dca2ca14654c545552/habitat/tasks/nav/nav.py#L533).
+#
+#
+# 7. [`habitat_baselines`](https://github.com/facebookresearch/habitat-lab/tree/71d409ab214a7814a9bd9b7e44fd25f57a0443ba/habitat_baselines)
+# RL, SLAM, and heuristic baseline implementations for the different embodied tasks.
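+
+# %% [markdown]
+# As a quick illustration of how these abstractions are wired together, the
+# cell below (a minimal sketch, assuming the imports at the top of this file
+# have run) looks up a task class in the registry by the same `TYPE` name
+# that the configs use.
+
+# %%
+if __name__ == "__main__":
+    # "Nav-v0" is the default TASK.TYPE name; registry.get_task returns the
+    # class registered under that name (NavigationTask here).
+    print(registry.get_task("Nav-v0"))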
+ +# %% [markdown] +# ## Create a new Task + +# %% +if __name__ == "__main__": + config = habitat.get_config( + config_paths="./configs/test/habitat_all_sensors_test.yaml" + ) + + +@registry.register_task(name="TestNav-v0") +class NewNavigationTask(NavigationTask): + def __init__(self, config, sim, dataset): + logger.info("Creating a new type of task") + super().__init__(config=config, sim=sim, dataset=dataset) + + def _check_episode_is_active(self, *args, **kwargs): + logger.info( + "Current agent position: {}".format(self._sim.get_agent_state()) + ) + collision = self._sim.previous_step_collided + stop_called = not getattr(self, "is_stop_called", False) + return collision or stop_called + + +if __name__ == "__main__": + config.defrost() + config.TASK.TYPE = "TestNav-v0" + config.freeze() + + try: + env.close() + except NameError: + pass + env = habitat.Env(config=config) + +# %% + action = None + env.reset() + valid_actions = ["TURN_LEFT", "TURN_RIGHT", "MOVE_FORWARD", "STOP"] + interactive_control = False # @param {type:"boolean"} + while env.episode_over is not True: + display_sample(obs["rgb"]) + if interactive_control: + action = input( + "enter action out of {}:\n".format(", ".join(valid_actions)) + ) + assert ( + action in valid_actions + ), "invalid action {} entered, choose one amongst " + ",".join( + valid_actions + ) + else: + action = valid_actions.pop() + obs = env.step( + { + "action": action, + "action_args": None, + } + ) + print("Episode over:", env.episode_over) + + env.close() + + +# %% [markdown] +# ## Create a new Sensor + +# %% +@registry.register_sensor(name="agent_position_sensor") +class AgentPositionSensor(habitat.Sensor): + def __init__(self, sim, config, **kwargs): + super().__init__(config=config) + self._sim = sim + + # Defines the name of the sensor in the sensor suite dictionary + def _get_uuid(self, *args, **kwargs): + return "agent_position" + + # Defines the type of the sensor + def _get_sensor_type(self, *args, **kwargs): + return habitat.SensorTypes.POSITION + + # Defines the size and range of the observations of the sensor + def _get_observation_space(self, *args, **kwargs): + return spaces.Box( + low=np.finfo(np.float32).min, + high=np.finfo(np.float32).max, + shape=(3,), + dtype=np.float32, + ) + + # This is called whenver reset is called or an action is taken + def get_observation(self, observations, *args, episode, **kwargs): + return self._sim.get_agent_state().position + + +# %% +if __name__ == "__main__": + config = habitat.get_config( + config_paths="./configs/test/habitat_all_sensors_test.yaml" + ) + + config.defrost() + # Now define the config for the sensor + config.TASK.AGENT_POSITION_SENSOR = habitat.Config() + # Use the custom name + config.TASK.AGENT_POSITION_SENSOR.TYPE = "agent_position_sensor" + # Add the sensor to the list of sensors in use + config.TASK.SENSORS.append("AGENT_POSITION_SENSOR") + config.freeze() + + try: + env.close() + except NameError: + pass + env = habitat.Env(config=config) + +# %% + obs = env.reset() + +# %% + obs.keys() + +# %% + print(obs["agent_position"]) + +# %% + env.close() + +# %% [markdown] +# ## Create a new Agent + +# %% +# An example agent which can be submitted to habitat-challenge. 
+# To participate and for more details refer to: +# - https://aihabitat.org/challenge/2020/ +# - https://github.com/facebookresearch/habitat-challenge + + +class ForwardOnlyAgent(habitat.Agent): + def __init__(self, success_distance, goal_sensor_uuid): + self.dist_threshold_to_stop = success_distance + self.goal_sensor_uuid = goal_sensor_uuid + + def reset(self): + pass + + def is_goal_reached(self, observations): + dist = observations[self.goal_sensor_uuid][0] + return dist <= self.dist_threshold_to_stop + + def act(self, observations): + if self.is_goal_reached(observations): + action = HabitatSimActions.STOP + else: + action = HabitatSimActions.MOVE_FORWARD + return {"action": action} + + +# %% [markdown] +# ### Other Examples +# +# [Create a new action space](https://github.com/facebookresearch/habitat-lab/blob/master/examples/new_actions.py) + +# %% +# @title Sim2Real with Habitat { display-mode: "form" } + +try: + from IPython.display import HTML + + HTML( + '' + ) +except ImportError: + pass + +# %% [markdown] +# Deploy habitat-sim trained models on real robots with the [habitat-pyrobot bridge](https://github.com/facebookresearch/habitat-lab/blob/71d409ab214a7814a9bd9b7e44fd25f57a0443ba/habitat/sims/pyrobot/pyrobot.py) +# +# ```python +# # Are we in sim or reality? +# if args.use_robot: # Use LoCoBot via PyRobot +# config.SIMULATOR.TYPE = "PyRobot-Locobot-v0" +# else: # Use simulation +# config.SIMULATOR.TYPE = "Habitat-Sim-v0" +# ``` +# +# Paper: [https://arxiv.org/abs/1912.06321](https://arxiv.org/abs/1912.06321) diff --git a/habitat-lab-dialog/examples/visualization_examples.py b/habitat-lab-dialog/examples/visualization_examples.py new file mode 100644 index 0000000..3bb5923 --- /dev/null +++ b/habitat-lab-dialog/examples/visualization_examples.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import os + +import imageio +import numpy as np + +import habitat +from habitat.tasks.nav.nav import NavigationEpisode, NavigationGoal +from habitat.utils.visualizations import maps + +IMAGE_DIR = os.path.join("examples", "images") +if not os.path.exists(IMAGE_DIR): + os.makedirs(IMAGE_DIR) + + +def example_pointnav_draw_target_birdseye_view(): + goal_radius = 0.5 + goal = NavigationGoal(position=[10, 0.25, 10], radius=goal_radius) + agent_position = np.array([0, 0.25, 0]) + agent_rotation = -np.pi / 4 + + dummy_episode = NavigationEpisode( + goals=[goal], + episode_id="dummy_id", + scene_id="dummy_scene", + start_position=agent_position, + start_rotation=agent_rotation, + ) + target_image = maps.pointnav_draw_target_birdseye_view( + agent_position, + agent_rotation, + np.asarray(dummy_episode.goals[0].position), + goal_radius=dummy_episode.goals[0].radius, + agent_radius_px=25, + ) + + imageio.imsave( + os.path.join(IMAGE_DIR, "pointnav_target_image.png"), target_image + ) + + +def example_pointnav_draw_target_birdseye_view_agent_on_border(): + goal_radius = 0.5 + goal = NavigationGoal(position=[0, 0.25, 0], radius=goal_radius) + ii = 0 + for x_edge in [-1, 0, 1]: + for y_edge in [-1, 0, 1]: + if not np.bitwise_xor(x_edge == 0, y_edge == 0): + continue + ii += 1 + agent_position = np.array([7.8 * x_edge, 0.25, 7.8 * y_edge]) + agent_rotation = np.pi / 2 + + dummy_episode = NavigationEpisode( + goals=[goal], + episode_id="dummy_id", + scene_id="dummy_scene", + start_position=agent_position, + start_rotation=agent_rotation, + ) + target_image = maps.pointnav_draw_target_birdseye_view( + agent_position, + agent_rotation, + np.asarray(dummy_episode.goals[0].position), + goal_radius=dummy_episode.goals[0].radius, + agent_radius_px=25, + ) + imageio.imsave( + os.path.join( + IMAGE_DIR, "pointnav_target_image_edge_%d.png" % ii + ), + target_image, + ) + + +def example_get_topdown_map(): + config = habitat.get_config(config_paths="configs/tasks/pointnav.yaml") + dataset = habitat.make_dataset( + id_dataset=config.DATASET.TYPE, config=config.DATASET + ) + with habitat.Env(config=config, dataset=dataset) as env: + env.reset() + top_down_map = maps.get_topdown_map_from_sim( + env.sim, map_resolution=1024 + ) + recolor_map = np.array( + [[255, 255, 255], [128, 128, 128], [0, 0, 0]], dtype=np.uint8 + ) + top_down_map = recolor_map[top_down_map] + imageio.imsave( + os.path.join(IMAGE_DIR, "top_down_map.png"), top_down_map + ) + + +def main(): + example_pointnav_draw_target_birdseye_view() + example_get_topdown_map() + example_pointnav_draw_target_birdseye_view_agent_on_border() + + +if __name__ == "__main__": + main() diff --git a/habitat-lab-dialog/examples/vln_benchmark.py b/habitat-lab-dialog/examples/vln_benchmark.py new file mode 100644 index 0000000..4bd448b --- /dev/null +++ b/habitat-lab-dialog/examples/vln_benchmark.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import argparse +from collections import defaultdict +from typing import Dict + +import habitat +from habitat.config.default import get_config +from habitat.sims.habitat_simulator.actions import HabitatSimActions +from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower + + +def reference_path_benchmark(config, num_episodes=None): + """ + Custom benchmark for the reference path agent because it requires access + to habitat_env during each episode. Agent follows the ground truth + reference path by navigating to intermediate viewpoints en route to goal. + Args: + config: Config + num_episodes: Count of episodes to evaluate on. + """ + with habitat.Env(config=config) as env: + if num_episodes is None: + num_episodes = len(env.episodes) + + follower = ShortestPathFollower( + env.sim, goal_radius=0.5, return_one_hot=False + ) + follower.mode = "geodesic_path" + + agg_metrics: Dict = defaultdict(float) + for _ in range(num_episodes): + env.reset() + + for point in env.current_episode.reference_path: + while not env.episode_over: + best_action = follower.get_next_action(point) + if best_action == None: + break + env.step(best_action) + + while not env.episode_over: + best_action = follower.get_next_action( + env.current_episode.goals[0].position + ) + if best_action == None: + best_action = HabitatSimActions.STOP + env.step(best_action) + + for m, v in env.get_metrics().items(): + agg_metrics[m] += v + + avg_metrics = {k: v / num_episodes for k, v in agg_metrics.items()} + return avg_metrics + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--task-config", type=str, default="configs/tasks/vln_r2r.yaml" + ) + args = parser.parse_args() + config = get_config(args.task_config) + + metrics = reference_path_benchmark(config, num_episodes=10) + + print("Benchmark for Reference Path Follower agent:") + for k, v in metrics.items(): + print("{}: {:.3f}".format(k, v)) + + +if __name__ == "__main__": + main() diff --git a/habitat-lab-dialog/examples/vln_reference_path_follower_example.py b/habitat-lab-dialog/examples/vln_reference_path_follower_example.py new file mode 100644 index 0000000..9e7ce9a --- /dev/null +++ b/habitat-lab-dialog/examples/vln_reference_path_follower_example.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import shutil + +import numpy as np + +import habitat +from examples.shortest_path_follower_example import ( + SimpleRLEnv, + draw_top_down_map, +) +from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower +from habitat.utils.visualizations.utils import ( + append_text_to_image, + images_to_video, +) + +IMAGE_DIR = os.path.join("examples", "images") +if not os.path.exists(IMAGE_DIR): + os.makedirs(IMAGE_DIR) + + +def save_map(observations, info, images): + im = observations["rgb"] + top_down_map = draw_top_down_map(info, im.shape[0]) + output_im = np.concatenate((im, top_down_map), axis=1) + output_im = append_text_to_image( + output_im, observations["instruction"]["text"] + ) + images.append(output_im) + + +def reference_path_example(mode): + """ + Saves a video of a shortest path follower agent navigating from a start + position to a goal. Agent follows the ground truth reference path by + navigating to intermediate viewpoints en route to goal. 
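+    One video per episode is written under
+    examples/images/vln_reference_path_example/<mode>/.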
+ Args: + mode: 'geodesic_path' or 'greedy' + """ + config = habitat.get_config( + config_paths="configs/test/habitat_r2r_vln_test.yaml" + ) + config.defrost() + config.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") + config.TASK.SENSORS.append("HEADING_SENSOR") + config.freeze() + with SimpleRLEnv(config=config) as env: + follower = ShortestPathFollower( + env.habitat_env.sim, goal_radius=0.5, return_one_hot=False + ) + follower.mode = mode + print("Environment creation successful") + + for episode in range(3): + env.reset() + episode_id = env.habitat_env.current_episode.episode_id + print( + f"Agent stepping around inside environment. Episode id: {episode_id}" + ) + + dirname = os.path.join( + IMAGE_DIR, "vln_reference_path_example", mode, "%02d" % episode + ) + if os.path.exists(dirname): + shutil.rmtree(dirname) + os.makedirs(dirname) + + images = [] + steps = 0 + reference_path = env.habitat_env.current_episode.reference_path + [ + env.habitat_env.current_episode.goals[0].position + ] + for point in reference_path: + done = False + while not done: + best_action = follower.get_next_action(point) + if best_action == None: + break + observations, reward, done, info = env.step(best_action) + save_map(observations, info, images) + steps += 1 + + print(f"Navigated to goal in {steps} steps.") + images_to_video(images, dirname, str(episode_id)) + images = [] + + +if __name__ == "__main__": + reference_path_example("geodesic_path") diff --git a/habitat-lab-dialog/habitat/__init__.py b/habitat-lab-dialog/habitat/__init__.py new file mode 100644 index 0000000..5c90fbd --- /dev/null +++ b/habitat-lab-dialog/habitat/__init__.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat.config import Config, get_config +from habitat.core.agent import Agent +from habitat.core.benchmark import Benchmark +from habitat.core.challenge import Challenge +from habitat.core.dataset import Dataset +from habitat.core.embodied_task import EmbodiedTask, Measure, Measurements +from habitat.core.env import Env, RLEnv +from habitat.core.logging import logger +from habitat.core.registry import registry # noqa : F401 +from habitat.core.simulator import Sensor, SensorSuite, SensorTypes, Simulator +from habitat.core.vector_env import ThreadedVectorEnv, VectorEnv +from habitat.datasets import make_dataset +from habitat.version import VERSION as __version__ # noqa + +__all__ = [ + "Agent", + "Benchmark", + "Challenge", + "Config", + "Dataset", + "EmbodiedTask", + "Env", + "get_config", + "logger", + "make_dataset", + "Measure", + "Measurements", + "RLEnv", + "Sensor", + "SensorSuite", + "SensorTypes", + "Simulator", + "ThreadedVectorEnv", + "VectorEnv", +] diff --git a/habitat-lab-dialog/habitat/config/__init__.py b/habitat-lab-dialog/habitat/config/__init__.py new file mode 100644 index 0000000..036b0fa --- /dev/null +++ b/habitat-lab-dialog/habitat/config/__init__.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+
+r"""Habitat Lab Configuration
+==============================
+
+Habitat Lab uses the [Yacs configuration system](https://github.com/rbgirshick/yacs)
+with the paradigm of `your code + a YACS config for experiment E (+
+external dependencies + hardware + other nuisance terms ...) =
+reproducible experiment E`. Yacs advantages:
+- Checks for type consistency.
+- All parameters and default values are searchable in the code.
+- A parameter does not always need to be set, as each parameter can have a
+  default value.
+- Ability to freeze the config to prevent unintended changes.
+
+## Config usage
+An example of how to merge the default config with two other configs and
+overwrite one parameter that could come from the command line:
+```
+    merged_config = get_config(
+        config_paths=["configs/tasks/pointnav.yaml",
+                      "configs/dataset/val.yaml"],
+        opts=["ENVIRONMENT.MAX_EPISODE_STEPS", steps_limit]
+    )
+```
+
+## Config structure
+Below is the structure of the config used for Habitat:
+- Environment
+- Task
+    - Sensors
+    - Measurements
+- Simulator
+    - Agent
+        - Sensors
+- Dataset
+
+We use node names (e.g. `SENSORS: ['RGB_SENSOR', 'DEPTH_SENSOR']`) instead of a list
+of config nodes (e.g. `SENSORS: [{TYPE = "HabitatSimDepthSensor",
+MIN_DEPTH = 0}, ...]`) to declare the Sensors attached to an Agent or the Measures
+enabled for the Task. With this approach, it's still easy to overwrite a
+particular sensor parameter in a yaml file without redefining the whole sensor
+config.
+
+## Extending the config without defaults
+Create a YAML file and add new fields and values. Load the custom config using
+`habitat.get_config()` and the defined fields will be merged into the default
+Habitat config:
+```
+import habitat
+import argparse
+from typing import List, Optional, Union
+
+config = habitat.get_config("{path to user-defined yaml config}")
+env = habitat.Env(config)
+```
+
+## Extending the config with default values
+Example of how to extend a config outside of the `habitat-lab` repository.
+First, we create a config extending the default config in the code and re-use +`habitat.get_config()`: +``` +import habitat +import argparse +from typing import List, Optional, Union + +_C = habitat.get_config() +_C.defrost() +# Add new parameters to the config +_C.TASK.EPISODE_INFO = habitat.Config() +_C.TASK.EPISODE_INFO.TYPE = "EpisodeInfo" +_C.TASK.EPISODE_INFO.VALUE = 5 +_C.TASK.MEASUREMENTS.append("EPISODE_INFO") + +# New function returning extended Habitat config that should be used instead +# of habitat.get_config() +def my_get_config( + config_paths: Optional[Union[List[str], str]] = None, + opts: Optional[list] = None, +) -> habitat.Config: + CONFIG_FILE_SEPARATOR = "," + config = _C.clone() + if config_paths: + if isinstance(config_paths, str): + if CONFIG_FILE_SEPARATOR in config_paths: + config_paths = config_paths.split(CONFIG_FILE_SEPARATOR) + else: + config_paths = [config_paths] + + for config_path in config_paths: + config.merge_from_file(config_path) + + if opts: + config.merge_from_list(opts) + + config.freeze() + return config + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--task-config", + type=str, + default="configs/tasks/pointnav.yaml," + "configs/datasets/pointnav/habitat_test.yaml", + ) + parser.add_argument( + "opts", + default=None, + nargs=argparse.REMAINDER, + help="Modify config options from command line", + ) + args = parser.parse_args() + config = my_get_config(config_paths=args.task_config, opts=args.opts) + env = habitat.Env(config) + +```""" + +from habitat.config.default import Config, get_config + +__all__ = ["Config", "get_config"] diff --git a/habitat-lab-dialog/habitat/config/default.py b/habitat-lab-dialog/habitat/config/default.py new file mode 100644 index 0000000..57bdca0 --- /dev/null +++ b/habitat-lab-dialog/habitat/config/default.py @@ -0,0 +1,369 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
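Since the helper defined at the end of this file implements the comma-separated path handling and the `opts` override, a quick usage sketch may help. The two config paths below are the example ones from the module docstring and are assumptions, not files guaranteed to exist in every checkout:

```python
from habitat.config.default import get_config

# Two config files passed in one string, split on the "," separator,
# plus a command-line style override of a single parameter.
config = get_config(
    config_paths="configs/tasks/pointnav.yaml,"
    "configs/datasets/pointnav/habitat_test.yaml",
    opts=["ENVIRONMENT.MAX_EPISODE_STEPS", 50],
)
assert config.ENVIRONMENT.MAX_EPISODE_STEPS == 50

# The returned config is frozen; defrost it before mutating further.
config.defrost()
config.SIMULATOR.TURN_ANGLE = 15
config.freeze()
```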
+ +from typing import List, Optional, Union + +import yacs.config + + +# Default Habitat config node +class Config(yacs.config.CfgNode): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs, new_allowed=True) + + +CN = Config + +DEFAULT_CONFIG_DIR = "configs/" +CONFIG_FILE_SEPARATOR = "," + +# ----------------------------------------------------------------------------- +# Config definition +# ----------------------------------------------------------------------------- +_C = CN() +_C.SEED = 100 +# ----------------------------------------------------------------------------- +# ENVIRONMENT +# ----------------------------------------------------------------------------- +_C.ENVIRONMENT = CN() +_C.ENVIRONMENT.MAX_EPISODE_STEPS = 1000 +_C.ENVIRONMENT.MAX_EPISODE_SECONDS = 10000000 +_C.ENVIRONMENT.ITERATOR_OPTIONS = CN() +_C.ENVIRONMENT.ITERATOR_OPTIONS.CYCLE = True +_C.ENVIRONMENT.ITERATOR_OPTIONS.SHUFFLE = True +_C.ENVIRONMENT.ITERATOR_OPTIONS.GROUP_BY_SCENE = True +_C.ENVIRONMENT.ITERATOR_OPTIONS.NUM_EPISODE_SAMPLE = -1 +_C.ENVIRONMENT.ITERATOR_OPTIONS.MAX_SCENE_REPEAT_EPISODES = -1 +_C.ENVIRONMENT.ITERATOR_OPTIONS.MAX_SCENE_REPEAT_STEPS = int(1e4) +_C.ENVIRONMENT.ITERATOR_OPTIONS.STEP_REPETITION_RANGE = 0.2 +# ----------------------------------------------------------------------------- +# TASK +# ----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- +# # NAVIGATION TASK +# ----------------------------------------------------------------------------- +_C.TASK = CN() +_C.TASK.TYPE = "Nav-v0" +_C.TASK.SUCCESS_DISTANCE = 0.2 +_C.TASK.SENSORS = [] +_C.TASK.MEASUREMENTS = [] +_C.TASK.GOAL_SENSOR_UUID = "pointgoal" +_C.TASK.POSSIBLE_ACTIONS = ["STOP", "MOVE_FORWARD", "TURN_LEFT", "TURN_RIGHT"] +# ----------------------------------------------------------------------------- +# # ACTIONS +# ----------------------------------------------------------------------------- +ACTIONS = CN() +ACTIONS.STOP = CN() +ACTIONS.STOP.TYPE = "StopAction" +# ----------------------------------------------------------------------------- +# # NAVIGATION ACTIONS +# ----------------------------------------------------------------------------- +ACTIONS.MOVE_FORWARD = CN() +ACTIONS.MOVE_FORWARD.TYPE = "MoveForwardAction" +ACTIONS.TURN_LEFT = CN() +ACTIONS.TURN_LEFT.TYPE = "TurnLeftAction" +ACTIONS.TURN_RIGHT = CN() +ACTIONS.TURN_RIGHT.TYPE = "TurnRightAction" +ACTIONS.LOOK_UP = CN() +ACTIONS.LOOK_UP.TYPE = "LookUpAction" +ACTIONS.LOOK_DOWN = CN() +ACTIONS.LOOK_DOWN.TYPE = "LookDownAction" +ACTIONS.TELEPORT = CN() +ACTIONS.TELEPORT.TYPE = "TeleportAction" + +_C.TASK.ACTIONS = ACTIONS +# ----------------------------------------------------------------------------- +# # TASK SENSORS +# ----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- +# POINTGOAL SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.POINTGOAL_SENSOR = CN() +_C.TASK.POINTGOAL_SENSOR.TYPE = "PointGoalSensor" +_C.TASK.POINTGOAL_SENSOR.GOAL_FORMAT = "POLAR" +_C.TASK.POINTGOAL_SENSOR.DIMENSIONALITY = 2 +# ----------------------------------------------------------------------------- +# POINTGOAL WITH GPS+COMPASS SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.POINTGOAL_WITH_GPS_COMPASS_SENSOR = _C.TASK.POINTGOAL_SENSOR.clone() 
+_C.TASK.POINTGOAL_WITH_GPS_COMPASS_SENSOR.TYPE = ( + "PointGoalWithGPSCompassSensor" +) +# ----------------------------------------------------------------------------- +# OBJECTGOAL SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.OBJECTGOAL_SENSOR = CN() +_C.TASK.OBJECTGOAL_SENSOR.TYPE = "ObjectGoalSensor" +_C.TASK.OBJECTGOAL_SENSOR.GOAL_SPEC = "TASK_CATEGORY_ID" +_C.TASK.OBJECTGOAL_SENSOR.GOAL_SPEC_MAX_VAL = 50 +# ----------------------------------------------------------------------------- +# IMAGEGOAL SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.IMAGEGOAL_SENSOR = CN() +_C.TASK.IMAGEGOAL_SENSOR.TYPE = "ImageGoalSensor" +# ----------------------------------------------------------------------------- +# HEADING SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.HEADING_SENSOR = CN() +_C.TASK.HEADING_SENSOR.TYPE = "HeadingSensor" +# ----------------------------------------------------------------------------- +# COMPASS SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.COMPASS_SENSOR = CN() +_C.TASK.COMPASS_SENSOR.TYPE = "CompassSensor" +# ----------------------------------------------------------------------------- +# GPS SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.GPS_SENSOR = CN() +_C.TASK.GPS_SENSOR.TYPE = "GPSSensor" +_C.TASK.GPS_SENSOR.DIMENSIONALITY = 2 +# ----------------------------------------------------------------------------- +# PROXIMITY SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.PROXIMITY_SENSOR = CN() +_C.TASK.PROXIMITY_SENSOR.TYPE = "ProximitySensor" +_C.TASK.PROXIMITY_SENSOR.MAX_DETECTION_RADIUS = 2.0 +# ----------------------------------------------------------------------------- +# SUCCESS MEASUREMENT +# ----------------------------------------------------------------------------- +_C.TASK.SUCCESS = CN() +_C.TASK.SUCCESS.TYPE = "Success" +_C.TASK.SUCCESS.SUCCESS_DISTANCE = 0.2 +# ----------------------------------------------------------------------------- +# SPL MEASUREMENT +# ----------------------------------------------------------------------------- +_C.TASK.SPL = CN() +_C.TASK.SPL.TYPE = "SPL" +# ----------------------------------------------------------------------------- +# SOFT-SPL MEASUREMENT +# ----------------------------------------------------------------------------- +_C.TASK.SOFT_SPL = CN() +_C.TASK.SOFT_SPL.TYPE = "SoftSPL" +# ----------------------------------------------------------------------------- +# TopDownMap MEASUREMENT +# ----------------------------------------------------------------------------- +_C.TASK.TOP_DOWN_MAP = CN() +_C.TASK.TOP_DOWN_MAP.TYPE = "TopDownMap" +_C.TASK.TOP_DOWN_MAP.MAX_EPISODE_STEPS = _C.ENVIRONMENT.MAX_EPISODE_STEPS +_C.TASK.TOP_DOWN_MAP.MAP_PADDING = 3 +_C.TASK.TOP_DOWN_MAP.MAP_RESOLUTION = 1024 +_C.TASK.TOP_DOWN_MAP.DRAW_SOURCE = True +_C.TASK.TOP_DOWN_MAP.DRAW_BORDER = True +_C.TASK.TOP_DOWN_MAP.DRAW_SHORTEST_PATH = True +_C.TASK.TOP_DOWN_MAP.FOG_OF_WAR = CN() +_C.TASK.TOP_DOWN_MAP.FOG_OF_WAR.DRAW = True +_C.TASK.TOP_DOWN_MAP.FOG_OF_WAR.VISIBILITY_DIST = 5.0 +_C.TASK.TOP_DOWN_MAP.FOG_OF_WAR.FOV = 90 +_C.TASK.TOP_DOWN_MAP.DRAW_VIEW_POINTS = True +_C.TASK.TOP_DOWN_MAP.DRAW_GOAL_POSITIONS = True +# Axes aligned bounding boxes +_C.TASK.TOP_DOWN_MAP.DRAW_GOAL_AABBS = True +# 
----------------------------------------------------------------------------- +# COLLISIONS MEASUREMENT +# ----------------------------------------------------------------------------- +_C.TASK.COLLISIONS = CN() +_C.TASK.COLLISIONS.TYPE = "Collisions" +# ----------------------------------------------------------------------------- +# ----------------------------------------------------------------------------- +# # EQA TASK +# ----------------------------------------------------------------------------- +_C.TASK.ACTIONS.ANSWER = CN() +_C.TASK.ACTIONS.ANSWER.TYPE = "AnswerAction" +# # EQA TASK QUESTION SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.QUESTION_SENSOR = CN() +_C.TASK.QUESTION_SENSOR.TYPE = "QuestionSensor" +# ----------------------------------------------------------------------------- +# # EQA TASK CORRECT_ANSWER measure for training +# ----------------------------------------------------------------------------- +_C.TASK.CORRECT_ANSWER = CN() +_C.TASK.CORRECT_ANSWER.TYPE = "CorrectAnswer" +# ----------------------------------------------------------------------------- +# # EQA TASK ANSWER SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.EPISODE_INFO = CN() +_C.TASK.EPISODE_INFO.TYPE = "EpisodeInfo" +# ----------------------------------------------------------------------------- +# # VLN TASK INSTRUCTION SENSOR +# ----------------------------------------------------------------------------- +_C.TASK.INSTRUCTION_SENSOR = CN() +_C.TASK.INSTRUCTION_SENSOR.TYPE = "InstructionSensor" +_C.TASK.INSTRUCTION_SENSOR_UUID = "instruction" +# ----------------------------------------------------------------------------- +# # DISTANCE_TO_GOAL MEASUREMENT +# ----------------------------------------------------------------------------- +_C.TASK.DISTANCE_TO_GOAL = CN() +_C.TASK.DISTANCE_TO_GOAL.TYPE = "DistanceToGoal" +_C.TASK.DISTANCE_TO_GOAL.DISTANCE_TO = "POINT" +# ----------------------------------------------------------------------------- +# # ANSWER_ACCURACY MEASUREMENT +# ----------------------------------------------------------------------------- +_C.TASK.ANSWER_ACCURACY = CN() +_C.TASK.ANSWER_ACCURACY.TYPE = "AnswerAccuracy" +# ----------------------------------------------------------------------------- +# SIMULATOR +# ----------------------------------------------------------------------------- +_C.SIMULATOR = CN() +_C.SIMULATOR.TYPE = "Sim-v0" +_C.SIMULATOR.ACTION_SPACE_CONFIG = "v0" +_C.SIMULATOR.FORWARD_STEP_SIZE = 0.25 # in metres +_C.SIMULATOR.SCENE = ( + "data/scene_datasets/habitat-test-scenes/van-gogh-room.glb" +) +_C.SIMULATOR.SEED = _C.SEED +_C.SIMULATOR.TURN_ANGLE = 10 # angle to rotate left or right in degrees +_C.SIMULATOR.TILT_ANGLE = 15 # angle to tilt the camera up or down in degrees +_C.SIMULATOR.DEFAULT_AGENT_ID = 0 +# ----------------------------------------------------------------------------- +# SIMULATOR SENSORS +# ----------------------------------------------------------------------------- +SIMULATOR_SENSOR = CN() +SIMULATOR_SENSOR.HEIGHT = 480 +SIMULATOR_SENSOR.WIDTH = 640 +SIMULATOR_SENSOR.HFOV = 90 # horizontal field of view in degrees +SIMULATOR_SENSOR.POSITION = [0, 1.25, 0] +SIMULATOR_SENSOR.ORIENTATION = [0.0, 0.0, 0.0] # Euler's angles +# ----------------------------------------------------------------------------- +# RGB SENSOR +# ----------------------------------------------------------------------------- +_C.SIMULATOR.RGB_SENSOR = 
SIMULATOR_SENSOR.clone() +_C.SIMULATOR.RGB_SENSOR.TYPE = "HabitatSimRGBSensor" +# ----------------------------------------------------------------------------- +# DEPTH SENSOR +# ----------------------------------------------------------------------------- +_C.SIMULATOR.DEPTH_SENSOR = SIMULATOR_SENSOR.clone() +_C.SIMULATOR.DEPTH_SENSOR.TYPE = "HabitatSimDepthSensor" +_C.SIMULATOR.DEPTH_SENSOR.MIN_DEPTH = 0.0 +_C.SIMULATOR.DEPTH_SENSOR.MAX_DEPTH = 10.0 +_C.SIMULATOR.DEPTH_SENSOR.NORMALIZE_DEPTH = True +# ----------------------------------------------------------------------------- +# SEMANTIC SENSOR +# ----------------------------------------------------------------------------- +_C.SIMULATOR.SEMANTIC_SENSOR = SIMULATOR_SENSOR.clone() +_C.SIMULATOR.SEMANTIC_SENSOR.TYPE = "HabitatSimSemanticSensor" +# ----------------------------------------------------------------------------- +# AGENT +# ----------------------------------------------------------------------------- +_C.SIMULATOR.AGENT_0 = CN() +_C.SIMULATOR.AGENT_0.HEIGHT = 1.5 +_C.SIMULATOR.AGENT_0.RADIUS = 0.1 +_C.SIMULATOR.AGENT_0.MASS = 32.0 +_C.SIMULATOR.AGENT_0.LINEAR_ACCELERATION = 20.0 +_C.SIMULATOR.AGENT_0.ANGULAR_ACCELERATION = 4 * 3.14 +_C.SIMULATOR.AGENT_0.LINEAR_FRICTION = 0.5 +_C.SIMULATOR.AGENT_0.ANGULAR_FRICTION = 1.0 +_C.SIMULATOR.AGENT_0.COEFFICIENT_OF_RESTITUTION = 0.0 +_C.SIMULATOR.AGENT_0.SENSORS = ["RGB_SENSOR"] +_C.SIMULATOR.AGENT_0.IS_SET_START_STATE = False +_C.SIMULATOR.AGENT_0.START_POSITION = [0, 0, 0] +_C.SIMULATOR.AGENT_0.START_ROTATION = [0, 0, 0, 1] +_C.SIMULATOR.AGENTS = ["AGENT_0"] +# ----------------------------------------------------------------------------- +# SIMULATOR HABITAT_SIM_V0 +# ----------------------------------------------------------------------------- +_C.SIMULATOR.HABITAT_SIM_V0 = CN() +_C.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = 0 +# Use Habitat-Sim's GPU->GPU copy mode to return rendering results +# in PyTorch tensors. Requires Habitat-Sim to be built +# with --with-cuda +# This will generally imply sharing CUDA tensors between processes. 
+# Read here: https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
+# for the caveats that this results in
+_C.SIMULATOR.HABITAT_SIM_V0.GPU_GPU = False
+# Whether or not the agent slides on collisions
+_C.SIMULATOR.HABITAT_SIM_V0.ALLOW_SLIDING = True
+_C.SIMULATOR.HABITAT_SIM_V0.ENABLE_PHYSICS = False
+_C.SIMULATOR.HABITAT_SIM_V0.PHYSICS_CONFIG_FILE = (
+    "./data/default.physics_config.json"
+)
+# -----------------------------------------------------------------------------
+# PYROBOT
+# -----------------------------------------------------------------------------
+_C.PYROBOT = CN()
+_C.PYROBOT.ROBOTS = ["locobot"]  # types of robots supported
+_C.PYROBOT.ROBOT = "locobot"
+_C.PYROBOT.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR", "BUMP_SENSOR"]
+_C.PYROBOT.BASE_CONTROLLER = "proportional"
+_C.PYROBOT.BASE_PLANNER = "none"
+# -----------------------------------------------------------------------------
+# SENSORS
+# -----------------------------------------------------------------------------
+PYROBOT_VISUAL_SENSOR = CN()
+PYROBOT_VISUAL_SENSOR.HEIGHT = 480
+PYROBOT_VISUAL_SENSOR.WIDTH = 640
+# -----------------------------------------------------------------------------
+# RGB SENSOR
+# -----------------------------------------------------------------------------
+_C.PYROBOT.RGB_SENSOR = PYROBOT_VISUAL_SENSOR.clone()
+_C.PYROBOT.RGB_SENSOR.TYPE = "PyRobotRGBSensor"
+_C.PYROBOT.RGB_SENSOR.CENTER_CROP = False
+# -----------------------------------------------------------------------------
+# DEPTH SENSOR
+# -----------------------------------------------------------------------------
+_C.PYROBOT.DEPTH_SENSOR = PYROBOT_VISUAL_SENSOR.clone()
+_C.PYROBOT.DEPTH_SENSOR.TYPE = "PyRobotDepthSensor"
+_C.PYROBOT.DEPTH_SENSOR.MIN_DEPTH = 0.0
+_C.PYROBOT.DEPTH_SENSOR.MAX_DEPTH = 5.0
+_C.PYROBOT.DEPTH_SENSOR.NORMALIZE_DEPTH = True
+_C.PYROBOT.DEPTH_SENSOR.CENTER_CROP = False
+# -----------------------------------------------------------------------------
+# BUMP SENSOR
+# -----------------------------------------------------------------------------
+_C.PYROBOT.BUMP_SENSOR = CN()
+_C.PYROBOT.BUMP_SENSOR.TYPE = "PyRobotBumpSensor"
+# -----------------------------------------------------------------------------
+# ACTIONS LOCOBOT
+# -----------------------------------------------------------------------------
+_C.PYROBOT.LOCOBOT = CN()
+_C.PYROBOT.LOCOBOT.ACTIONS = ["BASE_ACTIONS", "CAMERA_ACTIONS"]
+_C.PYROBOT.LOCOBOT.BASE_ACTIONS = ["go_to_relative", "go_to_absolute"]
+_C.PYROBOT.LOCOBOT.CAMERA_ACTIONS = ["set_pan", "set_tilt", "set_pan_tilt"]
+# TODO(akadian): add support for Arm actions
+# -----------------------------------------------------------------------------
+# DATASET
+# -----------------------------------------------------------------------------
+_C.DATASET = CN()
+_C.DATASET.TYPE = "PointNav-v1"
+_C.DATASET.SPLIT = "train"
+_C.DATASET.SCENES_DIR = "data/scene_datasets"
+_C.DATASET.CONTENT_SCENES = ["*"]
+_C.DATASET.DATA_PATH = (
+    "data/datasets/pointnav/habitat-test-scenes/v1/{split}/{split}.json.gz"
+)
+
+# -----------------------------------------------------------------------------
+
+
+def get_config(
+    config_paths: Optional[Union[List[str], str]] = None,
+    opts: Optional[list] = None,
+) -> CN:
+    r"""Create a unified config with default values overwritten by values from
+    :p:`config_paths` and overwritten by options from :p:`opts`.
+
+    :param config_paths: List of config paths or string that contains a
+        comma-separated list of config paths.
+    :param opts: Config options (keys, values) in a list (e.g., passed from
+        the command line into the config). For example,
+        :py:`opts = ['FOO.BAR', 0.5]`. This argument can be used for
+        parameter sweeping or quick tests.
+    """
+    config = _C.clone()
+    if config_paths:
+        if isinstance(config_paths, str):
+            if CONFIG_FILE_SEPARATOR in config_paths:
+                config_paths = config_paths.split(CONFIG_FILE_SEPARATOR)
+            else:
+                config_paths = [config_paths]
+
+        for config_path in config_paths:
+            config.merge_from_file(config_path)
+
+    if opts:
+        config.merge_from_list(opts)
+
+    config.freeze()
+    return config
diff --git a/habitat-lab-dialog/habitat/core/__init__.py b/habitat-lab-dialog/habitat/core/__init__.py
new file mode 100644
index 0000000..240697e
--- /dev/null
+++ b/habitat-lab-dialog/habitat/core/__init__.py
@@ -0,0 +1,5 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/habitat-lab-dialog/habitat/core/agent.py b/habitat-lab-dialog/habitat/core/agent.py
new file mode 100644
index 0000000..14f87a8
--- /dev/null
+++ b/habitat-lab-dialog/habitat/core/agent.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+r"""Base implementation of agent inside habitat. To build agents inside habitat
+the user should subclass ``habitat.Agent`` and implement the ``act()``
+and ``reset()`` methods.
+"""
+
+from typing import Any, Dict, Union
+
+from habitat.core.simulator import Observations
+
+
+class Agent:
+    r"""Abstract class for defining agents which act inside :ref:`core.env.Env`.
+
+    This abstract class standardizes agents to allow seamless benchmarking.
+    """
+
+    def reset(self) -> None:
+        r"""Called before starting a new episode in environment."""
+        raise NotImplementedError
+
+    def act(
+        self, observations: "Observations"
+    ) -> Union[int, str, Dict[str, Any]]:
+        r"""Called to produce an action to perform in an environment.
+
+        :param observations: observations coming in from environment to be
+            used by agent to decide action.
+        :return: action to be taken inside the environment and optional action
+            arguments.
+        """
+        raise NotImplementedError
diff --git a/habitat-lab-dialog/habitat/core/benchmark.py b/habitat-lab-dialog/habitat/core/benchmark.py
new file mode 100644
index 0000000..284177b
--- /dev/null
+++ b/habitat-lab-dialog/habitat/core/benchmark.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+r"""Implements evaluation of ``habitat.Agent`` inside ``habitat.Env``.
+``habitat.Benchmark`` creates a ``habitat.Env`` which is specified through
+the ``config_env`` parameter in the constructor. The evaluation is
+task-agnostic and is implemented through metrics defined for
+``habitat.EmbodiedTask``.
+"""
+
+import os
+from collections import defaultdict
+from typing import Dict, Optional
+
+from habitat.config.default import get_config
+from habitat.core.agent import Agent
+from habitat.core.env import Env
+
+
+class Benchmark:
+    r"""Benchmark for evaluating agents in environments."""
+
+    def __init__(
+        self, config_paths: Optional[str] = None, eval_remote: bool = False
+    ) -> None:
+        r"""..
+
+        :param config_paths: file to be used for creating the environment
+        :param eval_remote: boolean indicating whether evaluation should be run remotely or locally
+        """
+        config_env = get_config(config_paths)
+        self._eval_remote = eval_remote
+
+        if self._eval_remote:
+            self._env = None
+        else:
+            self._env = Env(config=config_env)
+
+    def remote_evaluate(
+        self, agent: "Agent", num_episodes: Optional[int] = None
+    ):
+        # The modules imported below are specific to habitat-challenge remote evaluation.
+        # These modules are not part of the habitat-lab repository.
+        import pickle
+        import time
+
+        import evalai_environment_habitat  # noqa: F401
+        import evaluation_pb2
+        import evaluation_pb2_grpc
+        import grpc
+
+        time.sleep(60)
+
+        def pack_for_grpc(entity):
+            return pickle.dumps(entity)
+
+        def unpack_for_grpc(entity):
+            return pickle.loads(entity)
+
+        def remote_ep_over(stub):
+            res_env = unpack_for_grpc(
+                stub.episode_over(evaluation_pb2.Package()).SerializedEntity
+            )
+            return res_env["episode_over"]
+
+        env_address_port = os.environ.get("EVALENV_ADDPORT", "localhost:8085")
+        channel = grpc.insecure_channel(env_address_port)
+        stub = evaluation_pb2_grpc.EnvironmentStub(channel)
+
+        base_num_episodes = unpack_for_grpc(
+            stub.num_episodes(evaluation_pb2.Package()).SerializedEntity
+        )
+        num_episodes = base_num_episodes["num_episodes"]
+
+        agg_metrics: Dict = defaultdict(float)
+
+        count_episodes = 0
+
+        while count_episodes < num_episodes:
+            agent.reset()
+            res_env = unpack_for_grpc(
+                stub.reset(evaluation_pb2.Package()).SerializedEntity
+            )
+
+            while not remote_ep_over(stub):
+                obs = res_env["observations"]
+                action = agent.act(obs)
+
+                res_env = unpack_for_grpc(
+                    stub.act_on_environment(
+                        evaluation_pb2.Package(
+                            SerializedEntity=pack_for_grpc(action)
+                        )
+                    ).SerializedEntity
+                )
+
+            metrics = unpack_for_grpc(
+                stub.get_metrics(
+                    evaluation_pb2.Package(
+                        SerializedEntity=pack_for_grpc(action)
+                    )
+                ).SerializedEntity
+            )
+
+            for m, v in metrics["metrics"].items():
+                agg_metrics[m] += v
+            count_episodes += 1
+
+        avg_metrics = {k: v / count_episodes for k, v in agg_metrics.items()}
+
+        stub.evalai_update_submission(evaluation_pb2.Package())
+
+        return avg_metrics
+
+    def local_evaluate(
+        self, agent: "Agent", num_episodes: Optional[int] = None
+    ) -> Dict[str, float]:
+        if num_episodes is None:
+            num_episodes = len(self._env.episodes)
+        else:
+            assert num_episodes <= len(self._env.episodes), (
+                "num_episodes({}) is larger than number of episodes "
+                "in environment ({})".format(
+                    num_episodes, len(self._env.episodes)
+                )
+            )
+
+        assert num_episodes > 0, "num_episodes should be greater than 0"
+
+        agg_metrics: Dict = defaultdict(float)
+
+        count_episodes = 0
+        while count_episodes < num_episodes:
+            agent.reset()
+            observations = self._env.reset()
+
+            while not self._env.episode_over:
+                action = agent.act(observations)
+                observations = self._env.step(action)
+
+            metrics = self._env.get_metrics()
+            for m, v in metrics.items():
+                agg_metrics[m] += v
+            count_episodes += 1
+
+        avg_metrics = {k: v / count_episodes for k, v in agg_metrics.items()}
+
+        return avg_metrics
+
+    def evaluate(
+        self, agent: "Agent", num_episodes: Optional[int] = None
+    ) -> Dict[str, float]:
+        r"""..
+
+        :param agent: agent to be evaluated in environment.
+        :param num_episodes: number of episodes for which the evaluation
+            should be run.
+        :return: dict containing metrics tracked by environment.
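A minimal sketch of the expected call pattern follows (the config path and the trivial agent are illustrative assumptions, not part of this API):

```python
import habitat

class ForwardOnlyAgent(habitat.Agent):
    def reset(self) -> None:
        pass

    def act(self, observations):
        # Always walk forward; a real agent would use `observations`.
        return {"action": "MOVE_FORWARD"}

benchmark = habitat.Benchmark(config_paths="configs/tasks/pointnav.yaml")
metrics = benchmark.evaluate(ForwardOnlyAgent(), num_episodes=1)
print(metrics)  # averaged task metrics, e.g. success / SPL for PointNav
```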
+ """ + + if self._eval_remote is True: + return self.remote_evaluate(agent, num_episodes) + else: + return self.local_evaluate(agent, num_episodes) diff --git a/habitat-lab-dialog/habitat/core/challenge.py b/habitat-lab-dialog/habitat/core/challenge.py new file mode 100644 index 0000000..b879f90 --- /dev/null +++ b/habitat-lab-dialog/habitat/core/challenge.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os + +from habitat.core.benchmark import Benchmark +from habitat.core.logging import logger + + +class Challenge(Benchmark): + def __init__(self, eval_remote=False): + config_paths = os.environ["CHALLENGE_CONFIG_FILE"] + super().__init__(config_paths, eval_remote=eval_remote) + + def submit(self, agent): + metrics = super().evaluate(agent) + for k, v in metrics.items(): + logger.info("{}: {}".format(k, v)) diff --git a/habitat-lab-dialog/habitat/core/dataset.py b/habitat-lab-dialog/habitat/core/dataset.py new file mode 100644 index 0000000..b5508c8 --- /dev/null +++ b/habitat-lab-dialog/habitat/core/dataset.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +r"""Implements dataset functionality to be used ``habitat.EmbodiedTask``. +``habitat.core.dataset`` abstracts over a collection of +``habitat.core.Episode``. Each episode consists of a single instantiation +of a ``habitat.Agent`` inside ``habitat.Env``. +""" +import copy +import json +import os +import random +from itertools import groupby +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterator, + List, + Optional, + Sequence, + TypeVar, + Union, +) + +import attr +import numpy as np +from numpy import ndarray + +from habitat.config import Config +from habitat.core.utils import not_none_validator + +ALL_SCENES_MASK = "*" + + +@attr.s(auto_attribs=True, kw_only=True) +class Episode: + r"""Base class for episode specification that includes initial position and + rotation of agent, scene id, episode. + + :property episode_id: id of episode in the dataset, usually episode number. + :property scene_id: id of scene in dataset. + :property start_position: list of length 3 for cartesian coordinates + :py:`(x, y, z)`. + :property start_rotation: list of length 4 for (x, y, z, w) elements + of unit quaternion (versor) representing 3D agent orientation + (https://en.wikipedia.org/wiki/Versor). The rotation specifying the + agent's orientation is relative to the world coordinate axes. + + This information is provided by a :ref:`Dataset` instance. 
+ """ + + episode_id: str = attr.ib(default=None, validator=not_none_validator) + scene_id: str = attr.ib(default=None, validator=not_none_validator) + start_position: List[float] = attr.ib( + default=None, validator=not_none_validator + ) + start_rotation: List[float] = attr.ib( + default=None, validator=not_none_validator + ) + info: Optional[Dict[str, Any]] = None + _shortest_path_cache: Any = attr.ib(init=False, default=None) + + def __getstate__(self): + return { + k: v + for k, v in self.__dict__.items() + if k not in {"_shortest_path_cache"} + } + + def __setstate__(self, state): + self.__dict__.update(state) + self.__dict__["_shortest_path_cache"] = None + + +T = TypeVar("T", bound=Episode) + + +class Dataset(Generic[T]): + r"""Base class for dataset specification.""" + episodes: List[T] + + @staticmethod + def scene_from_scene_path(scene_path: str) -> str: + r"""Helper method to get the scene name from an episode. + + :param scene_path: The path to the scene, assumes this is formatted + ``/path/to/.`` + + :return: from the path + """ + return os.path.splitext(os.path.basename(scene_path))[0] + + @classmethod + def get_scenes_to_load(cls, config: Config) -> List[str]: + r"""Returns a list of scene names that would be loaded with this dataset. + + Useful for determing what scenes to split up among different workers. + + :param config: The config for the dataset + + :return: A list of scene names that would be loaded with the dataset + """ + assert cls.check_config_paths_exist(config) # type: ignore[attr-defined] + dataset = cls(config) # type: ignore[call-arg] + return list(map(cls.scene_from_scene_path, dataset.scene_ids)) + + @classmethod + def build_content_scenes_filter(cls, config) -> Callable[[T], bool]: + r"""Returns a filter function that takes an episode and returns True if that + episode is valid under the CONTENT_SCENES feild of the provided config + """ + scenes_to_load = set(config.CONTENT_SCENES) + + def _filter(ep: T) -> bool: + return ( + ALL_SCENES_MASK in scenes_to_load + or cls.scene_from_scene_path(ep.scene_id) in scenes_to_load + ) + + return _filter + + @property + def num_episodes(self) -> int: + r"""number of episodes in the dataset""" + return len(self.episodes) + + @property + def scene_ids(self) -> List[str]: + r"""unique scene ids present in the dataset.""" + return sorted({episode.scene_id for episode in self.episodes}) + + def get_scene_episodes(self, scene_id: str) -> List[T]: + r""".. + + :param scene_id: id of scene in scene dataset. + :return: list of episodes for the :p:`scene_id`. + """ + return list( + filter(lambda x: x.scene_id == scene_id, iter(self.episodes)) + ) + + def get_episodes(self, indexes: List[int]) -> List[T]: + r""".. + + :param indexes: episode indices in dataset. + :return: list of episodes corresponding to indexes. + """ + return [self.episodes[episode_id] for episode_id in indexes] + + def get_episode_iterator(self, *args: Any, **kwargs: Any) -> Iterator: + r"""Gets episode iterator with options. Options are specified in + :ref:`EpisodeIterator` documentation. + + :param args: positional args for iterator constructor + :param kwargs: keyword args for iterator constructor + :return: episode iterator with specified behavior + + To further customize iterator behavior for your :ref:`Dataset` + subclass, create a customized iterator class like + :ref:`EpisodeIterator` and override this method. 
+ """ + return EpisodeIterator(self.episodes, *args, **kwargs) + + def to_json(self) -> str: + class DatasetJSONEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + + return ( + obj.__getstate__() + if hasattr(obj, "__getstate__") + else obj.__dict__ + ) + + result = DatasetJSONEncoder().encode(self) + return result + + def from_json( + self, json_str: str, scenes_dir: Optional[str] = None + ) -> None: + r"""Creates dataset from :p:`json_str`. + + :param json_str: JSON string containing episodes information. + :param scenes_dir: directory containing graphical assets relevant + for episodes present in :p:`json_str`. + + Directory containing relevant graphical assets of scenes is passed + through :p:`scenes_dir`. + """ + raise NotImplementedError + + def filter_episodes(self, filter_fn: Callable[[T], bool]) -> "Dataset": + r"""Returns a new dataset with only the filtered episodes from the + original dataset. + + :param filter_fn: function used to filter the episodes. + :return: the new dataset. + """ + new_episodes = [] + for episode in self.episodes: + if filter_fn(episode): + new_episodes.append(episode) + new_dataset = copy.copy(self) + new_dataset.episodes = new_episodes + return new_dataset + + def get_splits( + self, + num_splits: int, + episodes_per_split: Optional[int] = None, + remove_unused_episodes: bool = False, + collate_scene_ids: bool = True, + sort_by_episode_id: bool = False, + allow_uneven_splits: bool = False, + ) -> List["Dataset"]: + r"""Returns a list of new datasets, each with a subset of the original + episodes. + + :param num_splits: the number of splits to create. + :param episodes_per_split: if provided, each split will have up to this + many episodes. If it is not provided, each dataset will have + :py:`len(original_dataset.episodes) // num_splits` episodes. If + max_episodes_per_split is provided and is larger than this value, + it will be capped to this value. + :param remove_unused_episodes: once the splits are created, the extra + episodes will be destroyed from the original dataset. This saves + memory for large datasets. + :param collate_scene_ids: if true, episodes with the same scene id are + next to each other. This saves on overhead of switching between + scenes, but means multiple sequential episodes will be related to + each other because they will be in the same scene. + :param sort_by_episode_id: if true, sequences are sorted by their + episode ID in the returned splits. + :param allow_uneven_splits: if true, the last splits can be shorter + than the others. This is especially useful for splitting over + validation/test datasets in order to make sure that all episodes + are copied but none are duplicated. + :return: a list of new datasets, each with their own subset of + episodes. + + All splits will have the same number of episodes, but no episodes will + be duplicated. + """ + if self.num_episodes < num_splits: + raise ValueError( + "Not enough episodes to create those many splits." + ) + + if episodes_per_split is not None: + if allow_uneven_splits: + raise ValueError( + "You probably don't want to specify allow_uneven_splits" + " and episodes_per_split." + ) + + if num_splits * episodes_per_split > self.num_episodes: + raise ValueError( + "Not enough episodes to create those many splits." 
+                )
+
+        new_datasets = []
+
+        if episodes_per_split is not None:
+            stride = episodes_per_split
+        else:
+            stride = self.num_episodes // num_splits
+        split_lengths = [stride] * num_splits
+
+        if allow_uneven_splits:
+            episodes_left = self.num_episodes - stride * num_splits
+            split_lengths[:episodes_left] = [stride + 1] * episodes_left
+            assert sum(split_lengths) == self.num_episodes
+
+        num_episodes = sum(split_lengths)
+
+        rand_items = np.random.choice(
+            self.num_episodes, num_episodes, replace=False
+        )
+        if collate_scene_ids:
+            scene_ids: Dict[str, List[int]] = {}
+            for rand_ind in rand_items:
+                scene = self.episodes[rand_ind].scene_id
+                if scene not in scene_ids:
+                    scene_ids[scene] = []
+                scene_ids[scene].append(rand_ind)
+            rand_items = []
+            list(map(rand_items.extend, scene_ids.values()))
+        ep_ind = 0
+        new_episodes = []
+        for nn in range(num_splits):
+            new_dataset = copy.copy(self)  # Creates a shallow copy
+            new_dataset.episodes = []
+            new_datasets.append(new_dataset)
+            for _ii in range(split_lengths[nn]):
+                new_dataset.episodes.append(self.episodes[rand_items[ep_ind]])
+                ep_ind += 1
+            if sort_by_episode_id:
+                new_dataset.episodes.sort(key=lambda ep: ep.episode_id)
+            new_episodes.extend(new_dataset.episodes)
+        if remove_unused_episodes:
+            self.episodes = new_episodes
+        return new_datasets
+
+
+class EpisodeIterator(Iterator):
+    r"""Episode Iterator class that gives options for how a list of episodes
+    should be iterated.
+
+    Some of those options are desirable for the internal simulator to get
+    higher performance. More context: the simulator suffers overhead when
+    switching between scenes, therefore episodes of the same scene should be
+    loaded consecutively. However, if too many consecutive episodes from the
+    same scene are fed into an RL model, the model risks overfitting to that
+    scene. Therefore it is better to load episodes from the same scene
+    consecutively and switch scenes once a certain threshold is reached.
+
+    Currently supports the following features:
+
+    Cycling:
+        when all episodes are iterated, cycle back to start instead of throwing
+        StopIteration.
+    Cycling with shuffle:
+        when cycling back, shuffle episode groups grouped by scene.
+    Group by scene:
+        episodes of the same scene will be grouped and loaded consecutively.
+    Set max scene repeat:
+        set a number threshold on how many episodes from the same scene can be
+        loaded consecutively.
+    Sample episodes:
+        sample the specified number of episodes.
+    """
+
+    def __init__(
+        self,
+        episodes: Sequence[T],
+        cycle: bool = True,
+        shuffle: bool = False,
+        group_by_scene: bool = True,
+        max_scene_repeat_episodes: int = -1,
+        max_scene_repeat_steps: int = -1,
+        num_episode_sample: int = -1,
+        step_repetition_range: float = 0.2,
+        seed: Optional[int] = None,
+    ) -> None:
+        r"""..
+
+        :param episodes: list of episodes.
+        :param cycle: if :py:`True`, cycle back to first episodes when
+            StopIteration.
+        :param shuffle: if :py:`True`, shuffle scene groups when cycle. No
+            effect if cycle is set to :py:`False`. Will shuffle grouped scenes
+            if :p:`group_by_scene` is :py:`True`.
+        :param group_by_scene: if :py:`True`, group episodes from same scene.
+        :param max_scene_repeat_episodes: threshold of how many episodes from the same
+            scene can be loaded consecutively. :py:`-1` for no limit
+        :param max_scene_repeat_steps: threshold of how many steps from the same
+            scene can be taken consecutively. :py:`-1` for no limit
+        :param num_episode_sample: number of episodes to be sampled. :py:`-1`
+            for no sampling.
+
+        :param step_repetition_range: The maximum number of steps within each
+            scene is uniformly drawn from
+            [1 - step_repetition_range, 1 + step_repetition_range] * max_scene_repeat_steps
+            on each scene switch. This stops all workers from swapping scenes
+            at the same time.
+        """
+        if seed:
+            random.seed(seed)
+            np.random.seed(seed)
+
+        # sample episodes
+        if num_episode_sample >= 0:
+            episodes = np.random.choice(
+                episodes, num_episode_sample, replace=False
+            )
+
+        if not isinstance(episodes, list):
+            episodes = list(episodes)
+
+        self.episodes = episodes
+        self.cycle = cycle
+        self.group_by_scene = group_by_scene
+        self.shuffle = shuffle
+
+        if shuffle:
+            random.shuffle(self.episodes)
+
+        if group_by_scene:
+            self.episodes = self._group_scenes(self.episodes)
+
+        self.max_scene_repetition_episodes = max_scene_repeat_episodes
+        self.max_scene_repetition_steps = max_scene_repeat_steps
+
+        self._rep_count = -1  # 0 corresponds to first episode already returned
+        self._step_count = 0
+        self._prev_scene_id: Optional[str] = None
+
+        self._iterator = iter(self.episodes)
+
+        self.step_repetition_range = step_repetition_range
+        self._set_shuffle_intervals()
+
+    def __iter__(self) -> "EpisodeIterator":
+        return self
+
+    def __next__(self) -> Episode:
+        r"""The main logic for handling how episodes will be iterated.
+
+        :return: next episode.
+        """
+        self._forced_scene_switch_if()
+
+        next_episode = next(self._iterator, None)
+        if next_episode is None:
+            if not self.cycle:
+                raise StopIteration
+
+            self._iterator = iter(self.episodes)
+
+            if self.shuffle:
+                self._shuffle()
+
+            next_episode = next(self._iterator)
+
+        if (
+            self._prev_scene_id != next_episode.scene_id
+            and self._prev_scene_id is not None
+        ):
+            self._rep_count = 0
+            self._step_count = 0
+
+        self._prev_scene_id = next_episode.scene_id
+        return next_episode
+
+    def _forced_scene_switch(self) -> None:
+        r"""Internal method to switch the scene. Moves the remaining episodes
+        from the current scene to the end and switches to the next scene's
+        episodes.
+        """
+        grouped_episodes = [
+            list(g)
+            for k, g in groupby(self._iterator, key=lambda x: x.scene_id)
+        ]
+
+        if len(grouped_episodes) > 1:
+            # Ensure we swap by moving the current group to the end
+            grouped_episodes = grouped_episodes[1:] + grouped_episodes[0:1]
+
+        self._iterator = iter(sum(grouped_episodes, []))
+
+    def _shuffle(self) -> None:
+        r"""Internal method that shuffles the remaining episodes.
+        If self.group_by_scene is true, then shuffle groups of scenes.
+ """ + assert self.shuffle + episodes = list(self._iterator) + + random.shuffle(episodes) + + if self.group_by_scene: + episodes = self._group_scenes(episodes) + + self._iterator = iter(episodes) + + def _group_scenes( + self, episodes: Union[Sequence[Episode], List[Episode], ndarray] + ) -> List[T]: + r"""Internal method that groups episodes by scene + Groups will be ordered by the order the first episode of a given + scene is in the list of episodes + + So if the episodes list shuffled before calling this method, + the scenes will be in a random order + """ + assert self.group_by_scene + + scene_sort_keys: Dict[str, int] = {} + for e in episodes: + if e.scene_id not in scene_sort_keys: + scene_sort_keys[e.scene_id] = len(scene_sort_keys) + + return sorted(episodes, key=lambda e: scene_sort_keys[e.scene_id]) # type: ignore[arg-type] + + def step_taken(self) -> None: + self._step_count += 1 + + @staticmethod + def _randomize_value(value: int, value_range: float) -> int: + return random.randint( + int(value * (1 - value_range)), int(value * (1 + value_range)) + ) + + def _set_shuffle_intervals(self) -> None: + if self.max_scene_repetition_episodes > 0: + self._max_rep_episode = self.max_scene_repetition_episodes + else: + self._max_rep_episode = None + + if self.max_scene_repetition_steps > 0: + self._max_rep_step = self._randomize_value( + self.max_scene_repetition_steps, self.step_repetition_range + ) + else: + self._max_rep_step = None + + def _forced_scene_switch_if(self) -> None: + do_switch = False + self._rep_count += 1 + + # Shuffle if a scene has been selected more than _max_rep_episode times in a row + if ( + self._max_rep_episode is not None + and self._rep_count >= self._max_rep_episode + ): + do_switch = True + + # Shuffle if a scene has been used for more than _max_rep_step steps in a row + if ( + self._max_rep_step is not None + and self._step_count >= self._max_rep_step + ): + do_switch = True + + if do_switch: + self._forced_scene_switch() + self._set_shuffle_intervals() diff --git a/habitat-lab-dialog/habitat/core/embodied_task.py b/habitat-lab-dialog/habitat/core/embodied_task.py new file mode 100644 index 0000000..564589e --- /dev/null +++ b/habitat-lab-dialog/habitat/core/embodied_task.py @@ -0,0 +1,358 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +r"""Implements tasks and measurements needed for training and benchmarking of +``habitat.Agent`` inside ``habitat.Env``. +""" + +from collections import OrderedDict +from typing import Any, Dict, Iterable, List, Optional, Union + +import numpy as np + +from habitat.config import Config +from habitat.core.dataset import Dataset, Episode +from habitat.core.simulator import Observations, SensorSuite, Simulator +from habitat.core.spaces import ActionSpace, EmptySpace, Space + + +class Action: + r""" + An action that can be performed by an agent solving a task in environment. + For example for navigation task action classes will be: + ``MoveForwardAction, TurnLeftAction, TurnRightAction``. The action can + use ``Task`` members to pass a state to another action, as well as keep + own state and reset when new episode starts. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + return + + def reset(self, *args: Any, **kwargs: Any) -> None: + r"""Reset method is called from ``Env`` on each reset for each new + episode. 
+        """
+        raise NotImplementedError
+
+    def step(self, *args: Any, **kwargs: Any) -> Observations:
+        r"""Step method is called from ``Env`` on each ``step``. Can call
+        simulator or task method, change task's state.
+
+        :param kwargs: optional parameters for the action, like distance/force.
+        :return: observations after taking action in the task, including ones
+            coming from a simulator.
+        """
+        raise NotImplementedError
+
+    @property
+    def action_space(self) -> Space:
+        r"""the current Action's action space."""
+        raise NotImplementedError
+
+
+class SimulatorTaskAction(Action):
+    r"""
+    An ``EmbodiedTask`` action that is wrapping a simulator action.
+    """
+
+    def __init__(
+        self, *args: Any, config: Config, sim: Simulator, **kwargs: Any
+    ) -> None:
+        self._config = config
+        self._sim = sim
+
+    @property
+    def action_space(self):
+        return EmptySpace()
+
+    def reset(self, *args: Any, **kwargs: Any) -> None:
+        return None
+
+    def step(self, *args: Any, **kwargs: Any) -> Observations:
+        r"""Step method is called from ``Env`` on each ``step``."""
+        raise NotImplementedError
+
+
+class Measure:
+    r"""Represents a measure that provides measurement on top of environment
+    and task.
+
+    :data uuid: universally unique id.
+    :data _metric: metric for the :ref:`Measure`, this has to be updated with
+        each :ref:`step() <env.Env.step>` call on :ref:`env.Env`.
+
+    This can be used for tracking statistics when running experiments. The
+    user of this class needs to implement the :ref:`reset_metric()` and
+    :ref:`update_metric()` method and the user is also required to set the
+    :ref:`uuid <Measure.uuid>` and :ref:`_metric` attributes.
+
+    .. (uuid is a builtin Python module, so just :ref:`uuid` would link there)
+    """
+
+    _metric: Any
+    uuid: str
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        self.uuid = self._get_uuid(*args, **kwargs)
+        self._metric = None
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        raise NotImplementedError
+
+    def reset_metric(self, *args: Any, **kwargs: Any) -> None:
+        r"""Reset :ref:`_metric`, this method is called from :ref:`env.Env` on
+        each reset.
+        """
+        raise NotImplementedError
+
+    def update_metric(self, *args: Any, **kwargs: Any) -> None:
+        r"""Update :ref:`_metric`, this method is called from :ref:`env.Env`
+        on each :ref:`step() <env.Env.step>`
+        """
+        raise NotImplementedError
+
+    def get_metric(self):
+        r"""..
+
+        :return: the current metric for :ref:`Measure`.
+        """
+        return self._metric
+
+
+class Metrics(dict):
+    r"""Dictionary containing measurements."""
+
+    def __init__(self, measures: Dict[str, Measure]) -> None:
+        """Constructor
+
+        :param measures: list of :ref:`Measure` whose metrics are fetched and
+            packaged.
+        """
+        data = [
+            (uuid, measure.get_metric()) for uuid, measure in measures.items()
+        ]
+        super().__init__(data)
+
+
+class Measurements:
+    r"""Represents a set of Measures, with each :ref:`Measure` being
+    identified through a unique id.
+    """
+
+    measures: Dict[str, Measure]
+
+    def __init__(self, measures: Iterable[Measure]) -> None:
+        """Constructor
+
+        :param measures: list containing :ref:`Measure`, uuid of each
+            :ref:`Measure` must be unique.
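As an illustration of the ``Measure`` interface above, a minimal custom measure might look like this (the uuid and behavior are invented for the example):

```python
class EpisodeStepCount(Measure):
    """Counts the steps taken in the current episode."""

    def _get_uuid(self, *args, **kwargs) -> str:
        return "episode_step_count"

    def reset_metric(self, *args, **kwargs) -> None:
        self._metric = 0

    def update_metric(self, *args, **kwargs) -> None:
        self._metric += 1

measurements = Measurements([EpisodeStepCount()])
measurements.reset_measures()
measurements.update_measures()
assert measurements.get_metrics()["episode_step_count"] == 1
```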
+ """ + self.measures = OrderedDict() + for measure in measures: + assert ( + measure.uuid not in self.measures + ), "'{}' is duplicated measure uuid".format(measure.uuid) + self.measures[measure.uuid] = measure + + def reset_measures(self, *args: Any, **kwargs: Any) -> None: + for measure in self.measures.values(): + measure.reset_metric(*args, **kwargs) + + def update_measures(self, *args: Any, **kwargs: Any) -> None: + for measure in self.measures.values(): + measure.update_metric(*args, **kwargs) + + def get_metrics(self) -> Metrics: + r"""Collects measurement from all :ref:`Measure`\ s and returns it + packaged inside :ref:`Metrics`. + """ + return Metrics(self.measures) + + def _get_measure_index(self, measure_name): + return list(self.measures.keys()).index(measure_name) + + def check_measure_dependencies( + self, measure_name: str, dependencies: List[str] + ): + r"""Checks if dependencies measures are enabled and calculatethat the measure + :param measure_name: a name of the measure for which has dependencies. + :param dependencies: a list of a measure names that are required by + the measure. + :return: + """ + measure_index = self._get_measure_index(measure_name) + for dependency_measure in dependencies: + assert ( + dependency_measure in self.measures + ), f"""{measure_name} measure requires {dependency_measure} + listed in tje measures list in the config.""" + + for dependency_measure in dependencies: + assert measure_index > self._get_measure_index( + dependency_measure + ), f"""{measure_name} measure requires be listed after {dependency_measure} + in tje measures list in the config.""" + + +class EmbodiedTask: + r"""Base class for embodied tasks. ``EmbodiedTask`` holds definition of + a task that agent needs to solve: action space, observation space, + measures, simulator usage. ``EmbodiedTask`` has :ref:`reset` and + :ref:`step` methods that are called by ``Env``. ``EmbodiedTask`` is the + one of main dimensions for the framework extension. Once new embodied task + is introduced implementation of ``EmbodiedTask`` is a formal definition of + the task that opens opportunity for others to propose solutions and + include it into benchmark results. + + Args: + config: config for the task. + sim: reference to the simulator for calculating task observations. + dataset: reference to dataset for task instance level information. + + :data measurements: set of task measures. + :data sensor_suite: suite of task sensors. 
+ """ + + _config: Any + _sim: Optional[Simulator] + _dataset: Optional[Dataset] + _is_episode_active: bool + measurements: Measurements + sensor_suite: SensorSuite + + def __init__( + self, config: Config, sim: Simulator, dataset: Optional[Dataset] = None + ) -> None: + from habitat.core.registry import registry + + self._config = config + self._sim = sim + self._dataset = dataset + + self.measurements = Measurements( + self._init_entities( + entity_names=config.MEASUREMENTS, + register_func=registry.get_measure, + entities_config=config, + ).values() + ) + + self.sensor_suite = SensorSuite( + self._init_entities( + entity_names=config.SENSORS, + register_func=registry.get_sensor, + entities_config=config, + ).values() + ) + + self.actions = self._init_entities( + entity_names=config.POSSIBLE_ACTIONS, + register_func=registry.get_task_action, + entities_config=self._config.ACTIONS, + ) + self._action_keys = list(self.actions.keys()) + + def _init_entities( + self, entity_names, register_func, entities_config=None + ) -> OrderedDict: + if entities_config is None: + entities_config = self._config + + entities = OrderedDict() + for entity_name in entity_names: + entity_cfg = getattr(entities_config, entity_name) + entity_type = register_func(entity_cfg.TYPE) + assert ( + entity_type is not None + ), f"invalid {entity_name} type {entity_cfg.TYPE}" + entities[entity_name] = entity_type( + sim=self._sim, + config=entity_cfg, + dataset=self._dataset, + task=self, + ) + return entities + + def reset(self, episode: Episode): + observations = self._sim.reset() + observations.update( + self.sensor_suite.get_observations( + observations=observations, episode=episode, task=self + ) + ) + + for action_instance in self.actions.values(): + action_instance.reset(episode=episode, task=self) + + return observations + + def step(self, action: Dict[str, Any], episode: Episode): + if "action_args" not in action or action["action_args"] is None: + action["action_args"] = {} + action_name = action["action"] + if isinstance(action_name, (int, np.integer)): + action_name = self.get_action_name(action_name) + assert ( + action_name in self.actions + ), f"Can't find '{action_name}' action in {self.actions.keys()}." + + task_action = self.actions[action_name] + observations = task_action.step(**action["action_args"], task=self) + observations.update( + self.sensor_suite.get_observations( + observations=observations, + episode=episode, + action=action, + task=self, + ) + ) + + self._is_episode_active = self._check_episode_is_active( + observations=observations, action=action, episode=episode + ) + + return observations + + def get_action_name(self, action_index: int): + if action_index >= len(self.actions): + raise ValueError(f"Action index '{action_index}' is out of range.") + return self._action_keys[action_index] + + @property + def action_space(self) -> Space: + return ActionSpace( + { + action_name: action_instance.action_space + for action_name, action_instance in self.actions.items() + } + ) + + def overwrite_sim_config( + self, sim_config: Config, episode: Episode + ) -> Config: + r"""Update config merging information from :p:`sim_config` and + :p:`episode`. + + :param sim_config: config for simulator. + :param episode: current episode. 
+ """ + raise NotImplementedError + + def _check_episode_is_active( + self, + *args: Any, + action: Union[int, Dict[str, Any]], + episode: Episode, + **kwargs: Any, + ) -> bool: + raise NotImplementedError + + @property + def is_episode_active(self): + return self._is_episode_active + + def seed(self, seed: int) -> None: + return diff --git a/habitat-lab-dialog/habitat/core/env.py b/habitat-lab-dialog/habitat/core/env.py new file mode 100644 index 0000000..3a08027 --- /dev/null +++ b/habitat-lab-dialog/habitat/core/env.py @@ -0,0 +1,425 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import random +import time +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union, cast + +import gym +import numba +import numpy as np +from gym import spaces + +from habitat.config import Config +from habitat.core.dataset import Dataset, Episode, EpisodeIterator +from habitat.core.embodied_task import EmbodiedTask, Metrics +from habitat.core.simulator import Observations, Simulator +from habitat.datasets import make_dataset +from habitat.sims import make_sim +from habitat.tasks import make_task +from habitat.utils import profiling_wrapper + + +class Env: + r"""Fundamental environment class for :ref:`habitat`. + + :data observation_space: ``SpaceDict`` object corresponding to sensor in + sim and task. + :data action_space: ``gym.space`` object corresponding to valid actions. + + All the information needed for working on embodied tasks with simulator + is abstracted inside :ref:`Env`. Acts as a base for other derived + environment classes. :ref:`Env` consists of three major components: + ``dataset`` (`episodes`), ``simulator`` (:ref:`sim`) and :ref:`task` and + connects all the three components together. + """ + + observation_space: spaces.Dict + action_space: spaces.Dict + _config: Config + _dataset: Optional[Dataset] + number_of_episodes: Optional[int] + _episodes: List[Episode] + _current_episode_index: Optional[int] + _current_episode: Optional[Episode] + _episode_iterator: Optional[Iterator] + _sim: Simulator + _task: EmbodiedTask + _max_episode_seconds: int + _max_episode_steps: int + _elapsed_steps: int + _episode_start_time: Optional[float] + _episode_over: bool + + def __init__( + self, config: Config, dataset: Optional[Dataset] = None + ) -> None: + """Constructor + + :param config: config for the environment. Should contain id for + simulator and ``task_name`` which are passed into ``make_sim`` and + ``make_task``. + :param dataset: reference to dataset for task instance level + information. Can be defined as :py:`None` in which case + ``_episodes`` should be populated from outside. + """ + + assert config.is_frozen(), ( + "Freeze the config before creating the " + "environment, use config.freeze()." 
+ ) + self._config = config + self._dataset = dataset + self._current_episode_index = None + if self._dataset is None and config.DATASET.TYPE: + self._dataset = make_dataset( + id_dataset=config.DATASET.TYPE, config=config.DATASET + ) + self._episodes = ( + self._dataset.episodes + if self._dataset + else cast(List[Episode], []) + ) + self._current_episode = None + iter_option_dict = { + k.lower(): v + for k, v in config.ENVIRONMENT.ITERATOR_OPTIONS.items() + } + iter_option_dict["seed"] = config.SEED + self._episode_iterator = self._dataset.get_episode_iterator( + **iter_option_dict + ) + + # load the first scene if dataset is present + if self._dataset: + assert ( + len(self._dataset.episodes) > 0 + ), "dataset should have non-empty episodes list" + self._config.defrost() + self._config.SIMULATOR.SCENE = self._dataset.episodes[0].scene_id + self._config.freeze() + + self.number_of_episodes = len(self._dataset.episodes) + else: + self.number_of_episodes = None + + self._sim = make_sim( + id_sim=self._config.SIMULATOR.TYPE, config=self._config.SIMULATOR + ) + self._task = make_task( + self._config.TASK.TYPE, + config=self._config.TASK, + sim=self._sim, + dataset=self._dataset, + ) + self.observation_space = spaces.Dict( + { + **self._sim.sensor_suite.observation_spaces.spaces, + **self._task.sensor_suite.observation_spaces.spaces, + } + ) + self.action_space = self._task.action_space + self._max_episode_seconds = ( + self._config.ENVIRONMENT.MAX_EPISODE_SECONDS + ) + self._max_episode_steps = self._config.ENVIRONMENT.MAX_EPISODE_STEPS + self._elapsed_steps = 0 + self._episode_start_time: Optional[float] = None + self._episode_over = False + + @property + def current_episode(self) -> Episode: + assert self._current_episode is not None + return self._current_episode + + @current_episode.setter + def current_episode(self, episode: Episode) -> None: + self._current_episode = episode + + @property + def episode_iterator(self) -> Iterator: + return self._episode_iterator + + @episode_iterator.setter + def episode_iterator(self, new_iter: Iterator) -> None: + self._episode_iterator = new_iter + + @property + def episodes(self) -> List[Episode]: + return self._episodes + + @episodes.setter + def episodes(self, episodes: List[Episode]) -> None: + assert ( + len(episodes) > 0 + ), "Environment doesn't accept empty episodes list." + self._episodes = episodes + + @property + def sim(self) -> Simulator: + return self._sim + + @property + def episode_start_time(self) -> Optional[float]: + return self._episode_start_time + + @property + def episode_over(self) -> bool: + return self._episode_over + + @property + def task(self) -> EmbodiedTask: + return self._task + + @property + def _elapsed_seconds(self) -> float: + assert ( + self._episode_start_time + ), "Elapsed seconds requested before episode was started." + return time.time() - self._episode_start_time + + def get_metrics(self) -> Metrics: + return self._task.measurements.get_metrics() + + def _past_limit(self) -> bool: + if ( + self._max_episode_steps != 0 + and self._max_episode_steps <= self._elapsed_steps + ): + return True + elif ( + self._max_episode_seconds != 0 + and self._max_episode_seconds <= self._elapsed_seconds + ): + return True + return False + + def _reset_stats(self) -> None: + self._episode_start_time = time.time() + self._elapsed_steps = 0 + self._episode_over = False + + def reset(self) -> Observations: + r"""Resets the environments and returns the initial observations. + + :return: initial observations from the environment. 
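The canonical reset/step loop, as a random-agent sketch (action sampling assumes the task's gym-style action space):

```python
observations = env.reset()
while not env.episode_over:
    observations = env.step(env.action_space.sample())
metrics = env.get_metrics()
```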
+ """ + self._reset_stats() + + assert len(self.episodes) > 0, "Episodes list is empty" + # Delete the shortest path cache of the current episode + # Caching it for the next time we see this episode isn't really worth + # it + if self._current_episode is not None: + self._current_episode._shortest_path_cache = None + + self._current_episode = next(self._episode_iterator) + self.reconfigure(self._config) + + observations = self.task.reset(episode=self.current_episode) + self._task.measurements.reset_measures( + episode=self.current_episode, task=self.task + ) + + return observations + + def _update_step_stats(self) -> None: + self._elapsed_steps += 1 + self._episode_over = not self._task.is_episode_active + if self._past_limit(): + self._episode_over = True + + if self.episode_iterator is not None and isinstance( + self.episode_iterator, EpisodeIterator + ): + self.episode_iterator.step_taken() + + def step( + self, action: Union[int, str, Dict[str, Any]], **kwargs + ) -> Observations: + r"""Perform an action in the environment and return observations. + + :param action: action (belonging to :ref:`action_space`) to be + performed inside the environment. Action is a name or index of + allowed task's action and action arguments (belonging to action's + :ref:`action_space`) to support parametrized and continuous + actions. + :return: observations after taking action in environment. + """ + + assert ( + self._episode_start_time is not None + ), "Cannot call step before calling reset" + assert ( + self._episode_over is False + ), "Episode over, call reset before calling step" + + # Support simpler interface as well + if isinstance(action, (str, int, np.integer)): + action = {"action": action} + + observations = self.task.step( + action=action, episode=self.current_episode + ) + + self._task.measurements.update_measures( + episode=self.current_episode, action=action, task=self.task + ) + + self._update_step_stats() + + return observations + + @staticmethod + @numba.njit + def _seed_numba(seed: int): + random.seed(seed) + np.random.seed(seed) + + def seed(self, seed: int) -> None: + random.seed(seed) + np.random.seed(seed) + self._seed_numba(seed) + self._sim.seed(seed) + self._task.seed(seed) + + def reconfigure(self, config: Config) -> None: + self._config = config + + self._config.defrost() + self._config.SIMULATOR = self._task.overwrite_sim_config( + self._config.SIMULATOR, self.current_episode + ) + self._config.freeze() + + self._sim.reconfigure(self._config.SIMULATOR) + + def render(self, mode="rgb") -> np.ndarray: + return self._sim.render(mode) + + def close(self) -> None: + self._sim.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + + +class RLEnv(gym.Env): + r"""Reinforcement Learning (RL) environment class which subclasses ``gym.Env``. + + This is a wrapper over :ref:`Env` for RL users. To create custom RL + environments users should subclass `RLEnv` and define the following + methods: :ref:`get_reward_range()`, :ref:`get_reward()`, + :ref:`get_done()`, :ref:`get_info()`. + + As this is a subclass of ``gym.Env``, it implements `reset()` and + `step()`. + """ + + _env: Env + + def __init__( + self, config: Config, dataset: Optional[Dataset] = None + ) -> None: + """Constructor + + :param config: config to construct :ref:`Env` + :param dataset: dataset to construct :ref:`Env`. 
+ """ + + self._env = Env(config, dataset) + self.observation_space = self._env.observation_space + self.action_space = self._env.action_space + self.number_of_episodes = self._env.number_of_episodes + self.reward_range = self.get_reward_range() + + @property + def habitat_env(self) -> Env: + return self._env + + @property + def episodes(self) -> List[Episode]: + return self._env.episodes + + @episodes.setter + def episodes(self, episodes: List[Episode]) -> None: + self._env.episodes = episodes + + @property + def current_episode(self) -> Episode: + return self._env.current_episode + + @profiling_wrapper.RangeContext("RLEnv.reset") + def reset(self) -> Observations: + return self._env.reset() + + def get_reward_range(self): + r"""Get min, max range of reward. + + :return: :py:`[min, max]` range of reward. + """ + raise NotImplementedError + + def get_reward(self, observations: Observations) -> Any: + r"""Returns reward after action has been performed. + + :param observations: observations from simulator and task. + :return: reward after performing the last action. + + This method is called inside the :ref:`step()` method. + """ + raise NotImplementedError + + def get_done(self, observations: Observations) -> bool: + r"""Returns boolean indicating whether episode is done after performing + the last action. + + :param observations: observations from simulator and task. + :return: done boolean after performing the last action. + + This method is called inside the step method. + """ + raise NotImplementedError + + def get_info(self, observations) -> Dict[Any, Any]: + r""".. + + :param observations: observations from simulator and task. + :return: info after performing the last action. + """ + raise NotImplementedError + + @profiling_wrapper.RangeContext("RLEnv.step") + def step(self, *args, **kwargs) -> Tuple[Observations, Any, bool, dict]: + r"""Perform an action in the environment. + + :return: :py:`(observations, reward, done, info)` + """ + + observations = self._env.step(*args, **kwargs) + reward = self.get_reward(observations) + done = self.get_done(observations) + info = self.get_info(observations) + + return observations, reward, done, info + + def seed(self, seed: Optional[int] = None) -> None: + self._env.seed(seed) + + def render(self, mode: str = "rgb") -> np.ndarray: + return self._env.render(mode) + + def close(self) -> None: + self._env.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() diff --git a/habitat-lab-dialog/habitat/core/logging.py b/habitat-lab-dialog/habitat/core/logging.py new file mode 100644 index 0000000..a545803 --- /dev/null +++ b/habitat-lab-dialog/habitat/core/logging.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
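+# NOTE: illustrative sketch, not part of the upstream files. It shows the
+# `RLEnv` subclassing pattern described in habitat/core/env.py above; the
+# class name, reward values, and the "success" metric key are assumptions
+# made for the example only.
+def _example_rl_env_subclass():
+    from typing import Any, Dict
+
+    import habitat
+    from habitat.core.simulator import Observations
+
+    class SimpleRLEnv(habitat.RLEnv):
+        def get_reward_range(self):
+            return [-0.01, 10.0]
+
+        def get_reward(self, observations: Observations) -> float:
+            # Small per-step penalty; bonus if the task reports success.
+            return 10.0 if self._env.get_metrics().get("success") else -0.01
+
+        def get_done(self, observations: Observations) -> bool:
+            return self._env.episode_over
+
+        def get_info(self, observations: Observations) -> Dict[Any, Any]:
+            return self._env.get_metrics()
+
+    return SimpleRLEnv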
+
+import logging
+
+
+class HabitatLogger(logging.Logger):
+    def __init__(
+        self,
+        name,
+        level,
+        filename=None,
+        filemode="a",
+        stream=None,
+        format_str=None,
+        dateformat=None,
+        style="%",
+    ):
+        super().__init__(name, level)
+        if filename is not None:
+            handler = logging.FileHandler(filename, filemode)  # type:ignore
+        else:
+            handler = logging.StreamHandler(stream)  # type:ignore
+        self._formatter = logging.Formatter(format_str, dateformat, style)
+        handler.setFormatter(self._formatter)
+        super().addHandler(handler)
+
+    def add_filehandler(self, log_filename):
+        filehandler = logging.FileHandler(log_filename)
+        filehandler.setFormatter(self._formatter)
+        self.addHandler(filehandler)
+
+
+logger = HabitatLogger(
+    name="habitat", level=logging.INFO, format_str="%(asctime)-15s %(message)s"
+)
diff --git a/habitat-lab-dialog/habitat/core/registry.py b/habitat-lab-dialog/habitat/core/registry.py
new file mode 100644
index 0000000..8d5de05
--- /dev/null
+++ b/habitat-lab-dialog/habitat/core/registry.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+r"""Registry is the central source of truth in Habitat.
+
+Taken from Pythia, it is inspired by Redux's concept of a global store.
+Registry maintains mappings of various information to unique keys. Special
+functions in registry can be used as decorators to register different kinds of
+classes.
+
+Import the global registry object using
+
+.. code:: py
+
+    from habitat.core.registry import registry
+
+Various decorators for registering different kinds of classes with unique keys:
+
+- Register a task: ``@registry.register_task``
+- Register a task action: ``@registry.register_task_action``
+- Register a simulator: ``@registry.register_simulator``
+- Register a sensor: ``@registry.register_sensor``
+- Register a measure: ``@registry.register_measure``
+- Register a dataset: ``@registry.register_dataset``
+"""
+
+import collections
+from typing import Any, Callable, DefaultDict, Optional, Type
+
+from habitat.core.dataset import Dataset
+from habitat.core.embodied_task import Action, EmbodiedTask, Measure
+from habitat.core.simulator import ActionSpaceConfiguration, Sensor, Simulator
+from habitat.core.utils import Singleton
+
+
+class Registry(metaclass=Singleton):
+    mapping: DefaultDict[str, Any] = collections.defaultdict(dict)
+
+    @classmethod
+    def _register_impl(
+        cls,
+        _type: str,
+        to_register: Optional[Any],
+        name: Optional[str],
+        assert_type: Optional[Type] = None,
+    ) -> Callable:
+        def wrap(to_register):
+            if assert_type is not None:
+                assert issubclass(
+                    to_register, assert_type
+                ), "{} must be a subclass of {}".format(
+                    to_register, assert_type
+                )
+            register_name = to_register.__name__ if name is None else name
+
+            cls.mapping[_type][register_name] = to_register
+            return to_register
+
+        if to_register is None:
+            return wrap
+        else:
+            return wrap(to_register)
+
+    @classmethod
+    def register_task(cls, to_register=None, *, name: Optional[str] = None):
+        r"""Register a task to registry with key :p:`name`
+
+        :param name: Key with which the task will be registered.
+            If :py:`None` will use the name of the class
+
+        .. code:: py
+
+            from habitat.core.registry import registry
+            from habitat.core.embodied_task import EmbodiedTask
+
+            @registry.register_task
+            class MyTask(EmbodiedTask):
+                pass
+
+
+            # or
+
+            @registry.register_task(name="MyTaskName")
+            class MyTask(EmbodiedTask):
+                pass
+
+        """
+
+        return cls._register_impl(
+            "task", to_register, name, assert_type=EmbodiedTask
+        )
+
+    @classmethod
+    def register_simulator(
+        cls, to_register: None = None, *, name: Optional[str] = None
+    ):
+        r"""Register a simulator to registry with key :p:`name`
+
+        :param name: Key with which the simulator will be registered.
+            If :py:`None` will use the name of the class
+
+        .. code:: py
+
+            from habitat.core.registry import registry
+            from habitat.core.simulator import Simulator
+
+            @registry.register_simulator
+            class MySimulator(Simulator):
+                pass
+
+
+            # or
+
+            @registry.register_simulator(name="MySimName")
+            class MySimulator(Simulator):
+                pass
+
+        """
+
+        return cls._register_impl(
+            "sim", to_register, name, assert_type=Simulator
+        )
+
+    @classmethod
+    def register_sensor(cls, to_register=None, *, name: Optional[str] = None):
+        r"""Register a sensor to registry with key :p:`name`
+
+        :param name: Key with which the sensor will be registered.
+            If :py:`None` will use the name of the class
+        """
+
+        return cls._register_impl(
+            "sensor", to_register, name, assert_type=Sensor
+        )
+
+    @classmethod
+    def register_measure(cls, to_register=None, *, name: Optional[str] = None):
+        r"""Register a measure to registry with key :p:`name`
+
+        :param name: Key with which the measure will be registered.
+            If :py:`None` will use the name of the class
+        """
+
+        return cls._register_impl(
+            "measure", to_register, name, assert_type=Measure
+        )
+
+    @classmethod
+    def register_task_action(
+        cls, to_register=None, *, name: Optional[str] = None
+    ):
+        r"""Register a task action in this registry under key :p:`name`
+
+        :param name: Key with which the task action will be registered. If
+            :py:`None` will use the name of the task action's method.
+        """
+
+        return cls._register_impl(
+            "task_action", to_register, name, assert_type=Action
+        )
+
+    @classmethod
+    def register_dataset(cls, to_register=None, *, name: Optional[str] = None):
+        r"""Register a dataset to registry with key :p:`name`
+
+        :param name: Key with which the dataset will be registered.
+            If :py:`None` will use the name of the class
+        """
+
+        return cls._register_impl(
+            "dataset", to_register, name, assert_type=Dataset
+        )
+
+    @classmethod
+    def register_action_space_configuration(
+        cls, to_register=None, *, name: Optional[str] = None
+    ):
+        r"""Register an action space configuration to registry with key :p:`name`
+
+        :param name: Key with which the action space will be registered.
+            If :py:`None` will use the name of the class
+        """
+
+        return cls._register_impl(
+            "action_space_config",
+            to_register,
+            name,
+            assert_type=ActionSpaceConfiguration,
+        )
+
+    @classmethod
+    def _get_impl(cls, _type: str, name: str) -> Type:
+        return cls.mapping[_type].get(name, None)
+
+    @classmethod
+    def get_task(cls, name: str) -> Type[EmbodiedTask]:
+        return cls._get_impl("task", name)
+
+    @classmethod
+    def get_task_action(cls, name: str) -> Type[Action]:
+        return cls._get_impl("task_action", name)
+
+    @classmethod
+    def get_simulator(cls, name: str) -> Type[Simulator]:
+        return cls._get_impl("sim", name)
+
+    @classmethod
+    def get_sensor(cls, name: str) -> Type[Sensor]:
+        return cls._get_impl("sensor", name)
+
+    @classmethod
+    def get_measure(cls, name: str) -> Type[Measure]:
+        return cls._get_impl("measure", name)
+
+    @classmethod
+    def get_dataset(cls, name: str) -> Type[Dataset]:
+        return cls._get_impl("dataset", name)
+
+    @classmethod
+    def get_action_space_configuration(
+        cls, name: str
+    ) -> Type[ActionSpaceConfiguration]:
+        return cls._get_impl("action_space_config", name)
+
+
+registry = Registry()
diff --git a/habitat-lab-dialog/habitat/core/simulator.py b/habitat-lab-dialog/habitat/core/simulator.py
new file mode 100644
index 0000000..0f1a6b3
--- /dev/null
+++ b/habitat-lab-dialog/habitat/core/simulator.py
@@ -0,0 +1,395 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import abc
+from collections import OrderedDict
+from enum import Enum
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Union
+
+import attr
+import numpy as np
+import torch
+from gym import Space, spaces
+
+from habitat.config import Config
+from habitat.core.dataset import Episode
+
+VisualObservation = Union[torch.Tensor, np.ndarray]
+
+
+@attr.s(auto_attribs=True)
+class ActionSpaceConfiguration(metaclass=abc.ABCMeta):
+    config: Config
+
+    @abc.abstractmethod
+    def get(self) -> Any:
+        raise NotImplementedError
+
+
+class SensorTypes(Enum):
+    r"""Enumeration of types of sensors."""
+
+    NULL = 0
+    COLOR = 1
+    DEPTH = 2
+    NORMAL = 3
+    SEMANTIC = 4
+    PATH = 5
+    POSITION = 6
+    FORCE = 7
+    TENSOR = 8
+    TEXT = 9
+    MEASUREMENT = 10
+    HEADING = 11
+    TACTILE = 12
+    TOKEN_IDS = 13
+
+
+class Sensor(metaclass=abc.ABCMeta):
+    r"""Represents a sensor that provides data from the environment to the agent.
+
+    :data uuid: universally unique id.
+    :data sensor_type: type of Sensor, use SensorTypes enum if your sensor
+        comes under one of its categories.
+    :data observation_space: ``gym.Space`` object corresponding to observation
+        of sensor.
+ + The user of this class needs to implement the get_observation method and + the user is also required to set the below attributes: + """ + + uuid: str + config: Config + sensor_type: SensorTypes + observation_space: Space + + def __init__(self, *args: Any, **kwargs: Any) -> None: + self.config = kwargs["config"] if "config" in kwargs else None + if hasattr(self.config, "UUID"): + # We allow any sensor config to override the UUID + self.uuid = self.config.UUID + else: + self.uuid = self._get_uuid(*args, **kwargs) + self.sensor_type = self._get_sensor_type(*args, **kwargs) + self.observation_space = self._get_observation_space(*args, **kwargs) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + raise NotImplementedError + + def _get_sensor_type(self, *args: Any, **kwargs: Any) -> SensorTypes: + raise NotImplementedError + + def _get_observation_space(self, *args: Any, **kwargs: Any) -> Space: + raise NotImplementedError + + @abc.abstractmethod + def get_observation(self, *args: Any, **kwargs: Any) -> Any: + r""" + Returns: + current observation for Sensor. + """ + raise NotImplementedError + + +class Observations(Dict[str, Any]): + r"""Dictionary containing sensor observations""" + + def __init__( + self, sensors: Dict[str, Sensor], *args: Any, **kwargs: Any + ) -> None: + """Constructor + + :param sensors: list of sensors whose observations are fetched and + packaged. + """ + + data = [ + (uuid, sensor.get_observation(*args, **kwargs)) + for uuid, sensor in sensors.items() + ] + super().__init__(data) + + +class RGBSensor(Sensor, metaclass=abc.ABCMeta): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return "rgb" + + def _get_sensor_type(self, *args: Any, **kwargs: Any) -> SensorTypes: + return SensorTypes.COLOR + + def _get_observation_space(self, *args: Any, **kwargs: Any) -> Space: + raise NotImplementedError + + def get_observation(self, *args: Any, **kwargs: Any) -> VisualObservation: + raise NotImplementedError + + +class DepthSensor(Sensor, metaclass=abc.ABCMeta): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return "depth" + + def _get_sensor_type(self, *args: Any, **kwargs: Any) -> SensorTypes: + return SensorTypes.DEPTH + + def _get_observation_space(self, *args: Any, **kwargs: Any) -> Space: + raise NotImplementedError + + def get_observation(self, *args: Any, **kwargs: Any) -> VisualObservation: + raise NotImplementedError + + +class SemanticSensor(Sensor): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return "semantic" + + def _get_sensor_type(self, *args: Any, **kwargs: Any) -> SensorTypes: + return SensorTypes.SEMANTIC + + def _get_observation_space(self, *args: Any, **kwargs: Any) -> Space: + raise NotImplementedError + + def get_observation(self, *args: Any, **kwargs: Any) -> VisualObservation: + raise NotImplementedError + + +class BumpSensor(Sensor): + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return "bump" + + def _get_sensor_type(self, *args: Any, **kwargs: Any) -> SensorTypes: + return SensorTypes.FORCE + + def _get_observation_space(self, *args: Any, **kwargs: Any) -> Space: + raise NotImplementedError + + def get_observation(self, 
*args: Any, **kwargs: Any) -> Any:
+        raise NotImplementedError
+
+
+class SensorSuite:
+    r"""Represents a set of sensors, with each sensor being identified
+    through a unique id.
+    """
+
+    sensors: Dict[str, Sensor]
+    observation_spaces: spaces.Dict
+
+    def __init__(self, sensors: Iterable[Sensor]) -> None:
+        """Constructor
+
+        :param sensors: list containing sensors for the environment, uuid of
+            each sensor must be unique.
+        """
+        self.sensors = OrderedDict()
+        ordered_spaces: OrderedDict[str, Space] = OrderedDict()
+        for sensor in sensors:
+            assert (
+                sensor.uuid not in self.sensors
+            ), "'{}' is duplicated sensor uuid".format(sensor.uuid)
+            self.sensors[sensor.uuid] = sensor
+            ordered_spaces[sensor.uuid] = sensor.observation_space
+        self.observation_spaces = spaces.Dict(spaces=ordered_spaces)
+
+    def get(self, uuid: str) -> Sensor:
+        return self.sensors[uuid]
+
+    def get_observations(self, *args: Any, **kwargs: Any) -> Observations:
+        r"""Collects data from all sensors and returns it packaged inside
+        :ref:`Observations`.
+        """
+        return Observations(self.sensors, *args, **kwargs)
+
+
+@attr.s(auto_attribs=True)
+class AgentState:
+    position: Optional["np.ndarray"]
+    rotation: Optional["np.ndarray"] = None
+
+
+@attr.s(auto_attribs=True)
+class ShortestPathPoint:
+    position: List[Any]
+    rotation: List[Any]
+    action: Optional[int] = None
+
+
+class Simulator:
+    r"""Basic simulator class for Habitat. New simulators to be added to Habitat
+    must derive from this class and implement the abstract methods.
+    """
+    habitat_config: Config
+
+    def __init__(self, *args, **kwargs) -> None:
+        pass
+
+    @property
+    def sensor_suite(self) -> SensorSuite:
+        raise NotImplementedError
+
+    @property
+    def action_space(self) -> Space:
+        raise NotImplementedError
+
+    def reset(self) -> Observations:
+        r"""Resets the simulator and returns the initial observations.
+
+        :return: initial observations from simulator.
+        """
+        raise NotImplementedError
+
+    def step(self, action, *args, **kwargs) -> Observations:
+        r"""Perform an action in the simulator and return observations.
+
+        :param action: action to be performed inside the simulator.
+        :return: observations after taking action in simulator.
+        """
+        raise NotImplementedError
+
+    def seed(self, seed: int) -> None:
+        raise NotImplementedError
+
+    def reconfigure(self, config: Config) -> None:
+        raise NotImplementedError
+
+    def geodesic_distance(
+        self,
+        position_a: Sequence[float],
+        position_b: Union[Sequence[float], Sequence[Sequence[float]]],
+        episode: Optional[Episode] = None,
+    ) -> float:
+        r"""Calculates geodesic distance between two points.
+
+        :param position_a: coordinates of first point.
+        :param position_b: coordinates of second point or list of goal points
+            coordinates.
+        :param episode: The episode with these end points. This is used for
+            shortest path computation caching.
+        :return:
+            the geodesic distance in Cartesian space between points
+            :p:`position_a` and :p:`position_b`, if no path is found between
+            the points then :ref:`math.inf` is returned.
+        """
+        raise NotImplementedError
+
+    def get_agent_state(self, agent_id: int = 0) -> AgentState:
+        r"""..
+
+        :param agent_id: id of agent.
+        :return: state of agent corresponding to :p:`agent_id`.
+        """
+        raise NotImplementedError
+
+    def get_observations_at(
+        self,
+        position: List[float],
+        rotation: List[float],
+        keep_agent_at_new_pose: bool = False,
+    ) -> Optional[Observations]:
+        """Returns the observation.
+
+        :param position: list containing 3 entries for :py:`(x, y, z)`.
+ :param rotation: list with 4 entries for :py:`(x, y, z, w)` elements + of unit quaternion (versor) representing agent 3D orientation, + (https://en.wikipedia.org/wiki/Versor) + :param keep_agent_at_new_pose: If true, the agent will stay at the + requested location. Otherwise it will return to where it started. + :return: + The observations or :py:`None` if it was unable to get valid + observations. + + """ + raise NotImplementedError + + def sample_navigable_point(self) -> List[float]: + r"""Samples a navigable point from the simulator. A point is defined as + navigable if the agent can be initialized at that point. + + :return: navigable point. + """ + raise NotImplementedError + + def is_navigable(self, point: List[float]) -> bool: + r"""Return :py:`True` if the agent can stand at the specified point. + + :param point: the point to check. + """ + raise NotImplementedError + + def action_space_shortest_path( + self, source: AgentState, targets: List[AgentState], agent_id: int = 0 + ) -> List[ShortestPathPoint]: + r"""Calculates the shortest path between source and target agent + states. + + :param source: source agent state for shortest path calculation. + :param targets: target agent state(s) for shortest path calculation. + :param agent_id: id for agent (relevant for multi-agent setup). + :return: list of agent states and actions along the shortest path from + source to the nearest target (both included). + """ + raise NotImplementedError + + def get_straight_shortest_path_points( + self, position_a: List[float], position_b: List[float] + ) -> List[List[float]]: + r"""Returns points along the geodesic (shortest) path between two + points irrespective of the angles between the waypoints. + + :param position_a: the start point. This will be the first point in + the returned list. + :param position_b: the end point. This will be the last point in the + returned list. + :return: a list of waypoints :py:`(x, y, z)` on the geodesic path + between the two points. + """ + + raise NotImplementedError + + @property + def up_vector(self) -> "np.ndarray": + r"""The vector representing the direction upward (perpendicular to the + floor) from the global coordinate frame. + """ + raise NotImplementedError + + @property + def forward_vector(self) -> "np.ndarray": + r"""The forward direction in the global coordinate frame i.e. the + direction of forward movement for an agent with 0 degrees rotation in + the ground plane. + """ + raise NotImplementedError + + def render(self, mode: str = "rgb") -> Any: + raise NotImplementedError + + def close(self) -> None: + pass + + def previous_step_collided(self) -> bool: + r"""Whether or not the previous step resulted in a collision + + :return: :py:`True` if the previous step resulted in a collision, + :py:`False` otherwise + """ + raise NotImplementedError + + def __enter__(self) -> "Simulator": + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() diff --git a/habitat-lab-dialog/habitat/core/spaces.py b/habitat-lab-dialog/habitat/core/spaces.py new file mode 100644 index 0000000..5b4e6e2 --- /dev/null +++ b/habitat-lab-dialog/habitat/core/spaces.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
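+# NOTE: illustrative sketch, not part of the upstream files. It shows how the
+# `Simulator` interface from habitat/core/simulator.py above combines with
+# `registry.register_simulator`; "Sim-Stub-v0" and the stub behavior are
+# assumptions made for the example only.
+def _example_register_stub_simulator():
+    from gym import Space
+    from gym import spaces as gym_spaces
+
+    from habitat.core.registry import registry
+    from habitat.core.simulator import Observations, SensorSuite, Simulator
+
+    @registry.register_simulator(name="Sim-Stub-v0")
+    class StubSimulator(Simulator):
+        def __init__(self, config, *args, **kwargs) -> None:
+            self.habitat_config = config
+            self._sensor_suite = SensorSuite([])  # stub: no sensors attached
+
+        @property
+        def sensor_suite(self) -> SensorSuite:
+            return self._sensor_suite
+
+        @property
+        def action_space(self) -> Space:
+            return gym_spaces.Discrete(4)
+
+        def reset(self) -> Observations:
+            return self._sensor_suite.get_observations()
+
+        def step(self, action, *args, **kwargs) -> Observations:
+            return self._sensor_suite.get_observations()
+
+        def seed(self, seed: int) -> None:
+            pass
+
+        def reconfigure(self, config) -> None:
+            self.habitat_config = config
+
+    return StubSimulator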
+
+from collections import OrderedDict
+from collections.abc import Collection
+from typing import Dict, List, Union
+
+import gym
+from gym import Space
+
+
+class EmptySpace(Space):
+    """
+    A ``gym.Space`` that represents the argument space of an action that takes
+    no arguments. Needed for consistency; always samples a :py:`None` value.
+    """
+
+    def sample(self):
+        return None
+
+    def contains(self, x):
+        if x is None:
+            return True
+        return False
+
+    def __repr__(self):
+        return "EmptySpace()"
+
+
+class ActionSpace(gym.spaces.Dict):
+    """
+    A dictionary of ``EmbodiedTask`` actions and their argument spaces.
+
+    .. code:: py
+
+        self.action_space = ActionSpace({
+            "move": spaces.Dict({
+                "position": spaces.Discrete(2),
+                "velocity": spaces.Discrete(3)
+            }),
+            "move_forward": EmptySpace(),
+        })
+    """
+
+    def __init__(self, spaces: Union[List, Dict]):
+        if isinstance(spaces, dict):
+            self.spaces = OrderedDict(sorted(spaces.items()))
+        if isinstance(spaces, list):
+            self.spaces = OrderedDict(spaces)
+        self.actions_select = gym.spaces.Discrete(len(self.spaces))
+
+    @property
+    def n(self) -> int:
+        return len(self.spaces)
+
+    def sample(self):
+        action_index = self.actions_select.sample()
+        return {
+            "action": list(self.spaces.keys())[action_index],
+            "action_args": list(self.spaces.values())[action_index].sample(),
+        }
+
+    def contains(self, x):
+        if not isinstance(x, dict) or "action" not in x:
+            return False
+        if x["action"] not in self.spaces:
+            return False
+        if not self.spaces[x["action"]].contains(x.get("action_args", None)):
+            return False
+        return True
+
+    def __repr__(self):
+        return (
+            "ActionSpace("
+            + ", ".join([k + ":" + str(s) for k, s in self.spaces.items()])
+            + ")"
+        )
+
+
+class ListSpace(Space):
+    """
+    A ``gym.Space`` that describes a list of instances of another ``gym.Space``.
+    Used to describe lists of token ids, vectors, etc.
+
+    .. code:: py
+
+        observation_space = ListSpace(spaces.Discrete(
+            dataset.question_vocab.get_size()))
+    """
+
+    def __init__(
+        self,
+        space: Space,
+        min_seq_length: int = 0,
+        max_seq_length: int = 1 << 15,
+    ):
+        self.min_seq_length = min_seq_length
+        self.max_seq_length = max_seq_length
+        self.space = space
+        self.length_select = gym.spaces.Discrete(
+            max_seq_length - min_seq_length
+        )
+
+    def sample(self):
+        seq_length = self.length_select.sample() + self.min_seq_length
+        return [self.space.sample() for _ in range(seq_length)]
+
+    def contains(self, x):
+        if not isinstance(x, Collection):
+            return False
+
+        if not (self.min_seq_length <= len(x) <= self.max_seq_length):
+            return False
+
+        return all(self.space.contains(el) for el in x)
+
+    def __repr__(self):
+        return (
+            f"ListSpace({self.space}, min_seq_length="
+            f"{self.min_seq_length}, max_seq_length={self.max_seq_length})"
+        )
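+
+# NOTE: illustrative sketch, not part of the upstream files. It exercises the
+# `ActionSpace` and `EmptySpace` classes defined above; the space layout is an
+# assumption made for the example only.
+def _example_action_space_usage():
+    import gym
+
+    from habitat.core.spaces import ActionSpace, EmptySpace
+
+    space = ActionSpace(
+        {
+            "move": gym.spaces.Dict({"velocity": gym.spaces.Discrete(3)}),
+            "stop": EmptySpace(),  # action that takes no arguments
+        }
+    )
+    sample = space.sample()  # e.g. {"action": "stop", "action_args": None}
+    assert space.contains(sample)
+    assert space.n == 2
+    return sample
diff --git a/habitat-lab-dialog/habitat/core/utils.py b/habitat-lab-dialog/habitat/core/utils.py
new file mode 100644
index 0000000..871a659
--- /dev/null
+++ b/habitat-lab-dialog/habitat/core/utils.py
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import cmath
+import json
+import math
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import quaternion  # noqa: F401
+
+from habitat.utils.geometry_utils import quaternion_to_list
+
+# Internals from inner json library needed for patching functionality in
+# DatasetFloatJSONEncoder.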
+try: + from _json import encode_basestring_ascii # type: ignore +except ImportError: + encode_basestring_ascii = None # type: ignore +try: + from _json import encode_basestring # type: ignore +except ImportError: + encode_basestring = None # type: ignore + + +def tile_images(images: List[np.ndarray]) -> np.ndarray: + r"""Tile multiple images into single image + + Args: + images: list of images where each image has dimension + (height x width x channels) + + Returns: + tiled image (new_height x width x channels) + """ + assert len(images) > 0, "empty list of images" + np_images = np.asarray(images) + n_images, height, width, n_channels = np_images.shape + new_height = int(np.ceil(np.sqrt(n_images))) + new_width = int(np.ceil(float(n_images) / new_height)) + # pad with empty images to complete the rectangle + np_images = np.array( + images + + [images[0] * 0 for _ in range(n_images, new_height * new_width)] + ) + # img_HWhwc + out_image = np_images.reshape( + new_height, new_width, height, width, n_channels + ) + # img_HhWwc + out_image = out_image.transpose(0, 2, 1, 3, 4) + # img_Hh_Ww_c + out_image = out_image.reshape( + new_height * height, new_width * width, n_channels + ) + return out_image + + +def not_none_validator( + self: Any, attribute: Any, value: Optional[Any] +) -> None: + if value is None: + raise ValueError(f"Argument '{attribute.name}' must be set") + + +def try_cv2_import(): + r"""The PyRobot python3 version which is a dependency of Habitat-PyRobot integration + relies on ROS running in python2.7. In order to import cv2 in python3 we need to remove + the python2.7 path from sys.path. To use the Habitat-PyRobot integration the user + needs to export environment variable ROS_PATH which will look something like: + /opt/ros/kinetic/lib/python2.7/dist-packages + """ + import os + import sys + + ros_path = os.environ.get("ROS_PATH") + if ros_path is not None and ros_path in sys.path: + sys.path.remove(ros_path) + import cv2 + + sys.path.append(ros_path) + else: + import cv2 + + return cv2 + + +class Singleton(type): + _instances: Dict["Singleton", "Singleton"] = {} + + def __call__(cls, *args, **kwargs): + if cls not in cls._instances: + cls._instances[cls] = super(Singleton, cls).__call__( + *args, **kwargs + ) + return cls._instances[cls] + + +def center_crop(obs, new_shape): + top_left = ( + (obs.shape[0] // 2) - (new_shape[0] // 2), + (obs.shape[1] // 2) - (new_shape[1] // 2), + ) + bottom_right = ( + (obs.shape[0] // 2) + (new_shape[0] // 2), + (obs.shape[1] // 2) + (new_shape[1] // 2), + ) + obs = obs[top_left[0] : bottom_right[0], top_left[1] : bottom_right[1], :] + + return obs + + +class DatasetFloatJSONEncoder(json.JSONEncoder): + r"""JSON Encoder that sets a float precision for a space saving purpose and + encodes ndarray and quaternion. The encoder is compatible with JSON + version 2.0.9. + """ + + def default(self, obj): + # JSON doesn't support numpy ndarray and quaternion + if isinstance(obj, np.ndarray): + return obj.tolist() + if isinstance(obj, np.quaternion): + return quaternion_to_list(obj) + + return ( + obj.__getstate__() + if hasattr(obj, "__getstate__") + else obj.__dict__ + ) + + # Overriding method to inject own `_repr` function for floats with needed + # precision. 
+ def iterencode(self, o, _one_shot=False): + + markers: Optional[Dict] = {} if self.check_circular else None + if self.ensure_ascii: + _encoder = encode_basestring_ascii + else: + _encoder = encode_basestring + + def floatstr( + o, + allow_nan=self.allow_nan, + _repr=lambda x: format(x, ".5f"), + _inf=math.inf, + _neginf=-math.inf, + ): + if cmath.isnan(o): + text = "NaN" + elif o == _inf: + text = "Infinity" + elif o == _neginf: + text = "-Infinity" + else: + return _repr(o) + + if not allow_nan: + raise ValueError( + "Out of range float values are not JSON compliant: " + + repr(o) + ) + + return text + + _iterencode = json.encoder._make_iterencode( # type: ignore + markers, + self.default, + _encoder, + self.indent, + floatstr, + self.key_separator, + self.item_separator, + self.sort_keys, + self.skipkeys, + _one_shot, + ) + return _iterencode(o, 0) diff --git a/habitat-lab-dialog/habitat/core/vector_env.py b/habitat-lab-dialog/habitat/core/vector_env.py new file mode 100644 index 0000000..8dcefa1 --- /dev/null +++ b/habitat-lab-dialog/habitat/core/vector_env.py @@ -0,0 +1,759 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import signal +import warnings +from multiprocessing.connection import Connection +from multiprocessing.context import BaseContext +from queue import Queue +from threading import Thread +from typing import ( + Any, + Callable, + Dict, + Iterator, + List, + Optional, + Sequence, + Set, + Tuple, + Union, + cast, +) + +import attr +import gym +import numpy as np +from gym import spaces + +import habitat +from habitat.config import Config +from habitat.core.env import Env, RLEnv +from habitat.core.logging import logger +from habitat.core.utils import tile_images +from habitat.utils import profiling_wrapper +from habitat.utils.pickle5_multiprocessing import ConnectionWrapper + +try: + # Use torch.multiprocessing if we can. + # We have yet to find a reason to not use it and + # you are required to use it when sending a torch.Tensor + # between processes + import torch + from torch import multiprocessing as mp # type:ignore +except ImportError: + torch = None + import multiprocessing as mp # type:ignore + + +STEP_COMMAND = "step" +RESET_COMMAND = "reset" +RENDER_COMMAND = "render" +CLOSE_COMMAND = "close" +CALL_COMMAND = "call" +COUNT_EPISODES_COMMAND = "count_episodes" + +EPISODE_OVER_NAME = "episode_over" +GET_METRICS_NAME = "get_metrics" +CURRENT_EPISODE_NAME = "current_episode" +NUMBER_OF_EPISODE_NAME = "number_of_episodes" +ACTION_SPACE_NAME = "action_space" +OBSERVATION_SPACE_NAME = "observation_space" + +# ---------- +STATE_COMMAND = 'state' +IS_NEW_EPISODE_COMMAND = 'is_new_episode' +ORACLE_ACTION_COMMAND = 'o_action' +QUERY_NUM_COMMAND = 'query_num' +IS_QUERIED_NUM_COMMAND = 'is_queried' +ENV_ID_COMMAND = 'which_env' +CONSTRAINT_COMMAND = 'cons_constraint' + + +def _make_env_fn( + config: Config, dataset: Optional[habitat.Dataset] = None, rank: int = 0 +) -> Env: + """Constructor for default habitat :ref:`env.Env`. + + :param config: configuration for environment. + :param dataset: dataset for environment. 
+    :param rank: rank for setting seed of environment
+    :return: :ref:`env.Env` / :ref:`env.RLEnv` object
+    """
+    habitat_env = Env(config=config, dataset=dataset)
+    habitat_env.seed(config.SEED + rank)
+    return habitat_env
+
+
+@attr.s(auto_attribs=True, slots=True)
+class _ReadWrapper:
+    r"""Convenience wrapper to track if a connection to a worker process
+    should have something to read.
+    """
+    read_fn: Callable[[], Any]
+    rank: int
+    is_waiting: bool = False
+
+    def __call__(self) -> Any:
+        if not self.is_waiting:
+            raise RuntimeError(
+                f"Tried to read from process {self.rank}"
+                " but there is nothing waiting to be read"
+            )
+        res = self.read_fn()
+        self.is_waiting = False
+
+        return res
+
+
+@attr.s(auto_attribs=True, slots=True)
+class _WriteWrapper:
+    r"""Convenience wrapper to track if a connection to a worker process
+    can be written to safely. In other words, checks to make sure the
+    result returned from the last write was read.
+    """
+    write_fn: Callable[[Any], None]
+    read_wrapper: _ReadWrapper
+
+    def __call__(self, data: Any) -> None:
+        if self.read_wrapper.is_waiting:
+            raise RuntimeError(
+                f"Tried to write to process {self.read_wrapper.rank}"
+                " but the last write has not been read"
+            )
+        self.write_fn(data)
+        self.read_wrapper.is_waiting = True
+
+
+class VectorEnv:
+    r"""Vectorized environment which creates multiple processes where each
+    process runs its own environment. Main class for parallelization of
+    training and evaluation.
+
+    All the environments are synchronized on step and reset methods.
+    """
+
+    observation_spaces: List[spaces.Dict]
+    number_of_episodes: List[Optional[int]]
+    action_spaces: List[spaces.Dict]
+    _workers: List[Union[mp.Process, Thread]]
+    _num_envs: int
+    _auto_reset_done: bool
+    _mp_ctx: BaseContext
+    _connection_read_fns: List[_ReadWrapper]
+    _connection_write_fns: List[_WriteWrapper]
+
+    def __init__(
+        self,
+        make_env_fn: Callable[..., Union[Env, RLEnv]] = _make_env_fn,
+        env_fn_args: Sequence[Tuple] = None,
+        auto_reset_done: bool = True,
+        multiprocessing_start_method: str = "forkserver",
+        workers_ignore_signals: bool = False,
+    ) -> None:
+        """..
+
+        :param make_env_fn: function which creates a single environment. An
+            environment can be of type :ref:`env.Env` or :ref:`env.RLEnv`
+        :param env_fn_args: tuple of tuples of args to pass to the
+            :ref:`_make_env_fn`.
+        :param auto_reset_done: automatically reset the environment when
+            done. This functionality is provided for seamless training
+            of vectorized environments.
+        :param multiprocessing_start_method: the multiprocessing method used to
+            spawn worker processes. Valid methods are
+            :py:`{'spawn', 'forkserver', 'fork'}`; :py:`'forkserver'` is the
+            recommended method as it works well with CUDA. If :py:`'fork'` is
+            used, the subprocess must be started before any other GPU usage.
+        :param workers_ignore_signals: Whether or not workers will ignore SIGINT and SIGTERM
+            and instead will only exit when :ref:`close` is called
+        """
+        self._is_closed = True
+
+        assert (
+            env_fn_args is not None and len(env_fn_args) > 0
+        ), "number of environments to be created should be greater than 0"
+
+        self._num_envs = len(env_fn_args)
+
+        assert multiprocessing_start_method in self._valid_start_methods, (
+            "multiprocessing_start_method must be one of {}. 
Got '{}'" + ).format(self._valid_start_methods, multiprocessing_start_method) + self._auto_reset_done = auto_reset_done + self._mp_ctx = mp.get_context(multiprocessing_start_method) + self._workers = [] + ( + self._connection_read_fns, + self._connection_write_fns, + ) = self._spawn_workers( # noqa + env_fn_args, + make_env_fn, + workers_ignore_signals=workers_ignore_signals, + ) + + self._is_closed = False + + for write_fn in self._connection_write_fns: + write_fn((CALL_COMMAND, (OBSERVATION_SPACE_NAME, None))) + self.observation_spaces = [ + read_fn() for read_fn in self._connection_read_fns + ] + for write_fn in self._connection_write_fns: + write_fn((CALL_COMMAND, (ACTION_SPACE_NAME, None))) + self.action_spaces = [ + read_fn() for read_fn in self._connection_read_fns + ] + for write_fn in self._connection_write_fns: + write_fn((CALL_COMMAND, (NUMBER_OF_EPISODE_NAME, None))) + self.number_of_episodes = [ + read_fn() for read_fn in self._connection_read_fns + ] + self._paused: List[Tuple] = [] + + @property + def num_envs(self): + r"""number of individual environments.""" + return self._num_envs - len(self._paused) + + @staticmethod + @profiling_wrapper.RangeContext("_worker_env") + def _worker_env( + connection_read_fn: Callable, + connection_write_fn: Callable, + env_fn: Callable, + env_fn_args: Tuple[Any], + auto_reset_done: bool, + mask_signals: bool = False, + child_pipe: Optional[Connection] = None, + parent_pipe: Optional[Connection] = None, + ) -> None: + r"""process worker for creating and interacting with the environment.""" + if mask_signals: + signal.signal(signal.SIGINT, signal.SIG_IGN) + signal.signal(signal.SIGTERM, signal.SIG_IGN) + + signal.signal(signal.SIGUSR1, signal.SIG_IGN) + signal.signal(signal.SIGUSR2, signal.SIG_IGN) + + env = env_fn(*env_fn_args) + if parent_pipe is not None: + parent_pipe.close() + try: + command, data = connection_read_fn() + while command != CLOSE_COMMAND: + if command == STEP_COMMAND: + # different step methods for habitat.RLEnv and habitat.Env + if isinstance(env, (habitat.RLEnv, gym.Env)): + # habitat.RLEnv + observations, reward, done, info = env.step(**data) + if auto_reset_done and done: + observations = env.reset() + with profiling_wrapper.RangeContext( + "worker write after step" + ): + connection_write_fn( + (observations, reward, done, info) + ) + elif isinstance(env, habitat.Env): # type: ignore + # habitat.Env + observations = env.step(**data) + if auto_reset_done and env.episode_over: + observations = env.reset() + connection_write_fn(observations) + else: + raise NotImplementedError + + elif command == RESET_COMMAND: + observations = env.reset() + connection_write_fn(observations) + + elif command == RENDER_COMMAND: + connection_write_fn(env.render(*data[0], **data[1])) + + elif command == CALL_COMMAND: + function_name, function_args = data + if function_args is None: + function_args = {} + + result_or_fn = getattr(env, function_name) + + if len(function_args) > 0 or callable(result_or_fn): + result = result_or_fn(**function_args) + else: + result = result_or_fn + + connection_write_fn(result) + + elif command == COUNT_EPISODES_COMMAND: + connection_write_fn(len(env.episodes)) + + # -------------------------- + elif command == STATE_COMMAND: + connection_write_fn(env.agent_state()) + + elif command == IS_NEW_EPISODE_COMMAND: + connection_write_fn(env._new_episode) + + elif command == ORACLE_ACTION_COMMAND: + connection_write_fn(env.compute_oracle_actions()) + + # ---------------------------- + + elif command == 
QUERY_NUM_COMMAND: + connection_write_fn(env.set_query_num(data)) + + + elif command == CONSTRAINT_COMMAND: + connection_write_fn(env.set_constraint_reward(data)) + + elif command == IS_QUERIED_NUM_COMMAND: + connection_write_fn(env.set_idx(data)) + + elif command == ENV_ID_COMMAND: + connection_write_fn(env.set_is_queried(data)) + + + + + else: + raise NotImplementedError(f"Unknown command {command}") + + with profiling_wrapper.RangeContext("worker wait for command"): + command, data = connection_read_fn() + + except KeyboardInterrupt: + logger.info("Worker KeyboardInterrupt") + finally: + if child_pipe is not None: + child_pipe.close() + env.close() + + def _spawn_workers( + self, + env_fn_args: Sequence[Tuple], + make_env_fn: Callable[..., Union[Env, RLEnv]] = _make_env_fn, + workers_ignore_signals: bool = False, + ) -> Tuple[List[_ReadWrapper], List[_WriteWrapper]]: + parent_connections, worker_connections = zip( + *[ + [ConnectionWrapper(c) for c in self._mp_ctx.Pipe(duplex=True)] + for _ in range(self._num_envs) + ] + ) + self._workers = [] + for worker_conn, parent_conn, env_args in zip( + worker_connections, parent_connections, env_fn_args + ): + ps = self._mp_ctx.Process( + target=self._worker_env, + args=( + worker_conn.recv, + worker_conn.send, + make_env_fn, + env_args, + self._auto_reset_done, + workers_ignore_signals, + worker_conn, + parent_conn, + ), + ) + self._workers.append(cast(mp.Process, ps)) + ps.daemon = True + ps.start() + worker_conn.close() + + read_fns = [ + _ReadWrapper(p.recv, rank) + for rank, p in enumerate(parent_connections) + ] + write_fns = [ + _WriteWrapper(p.send, read_fn) + for p, read_fn in zip(parent_connections, read_fns) + ] + + return read_fns, write_fns + + def current_episodes(self): + for write_fn in self._connection_write_fns: + write_fn((CALL_COMMAND, (CURRENT_EPISODE_NAME, None))) + results = [] + for read_fn in self._connection_read_fns: + results.append(read_fn()) + return results + + def count_episodes(self): + for write_fn in self._connection_write_fns: + write_fn((COUNT_EPISODES_COMMAND, None)) + results = [] + for read_fn in self._connection_read_fns: + results.append(read_fn()) + return results + + # ----------------------- + def agent_state(self): + for write_fn in self._connection_write_fns: + write_fn((STATE_COMMAND, None)) + results = [] + for read_fn in self._connection_read_fns: + results.append(read_fn()) + return results + + def compute_oracle_actions(self): + for write_fn in self._connection_write_fns: + write_fn((ORACLE_ACTION_COMMAND, None)) + results = [] + for read_fn in self._connection_read_fns: + results.append(read_fn()) + return results + + def set_query_num_at(self, index_env: int, query_num: int): + self._connection_write_fns[index_env]((QUERY_NUM_COMMAND, query_num)) + self._connection_read_fns[index_env]() + + def set_constraint_reward_at(self, index_env: int, constraint_reward: float): + self._connection_write_fns[index_env]((CONSTRAINT_COMMAND, constraint_reward)) + self._connection_read_fns[index_env]() + + def set_idx_at(self, index_env: int, env_id: int): + self._connection_write_fns[index_env]((IS_QUERIED_NUM_COMMAND, env_id)) + self._connection_read_fns[index_env]() + + def set_is_queried_at(self, index_env: int, is_queried: bool): + self._connection_write_fns[index_env]((ENV_ID_COMMAND, is_queried)) + self._connection_read_fns[index_env]() + + def set_query_num(self, query_num_list: List[Any]): + for index_env, query_num in enumerate(query_num_list): + self.set_query_num_at(index_env, query_num) + + def 
set_constraint_reward(self, constraint_reward_list: List[Any]):
+        for index_env, constraint_reward in enumerate(constraint_reward_list):
+            self.set_constraint_reward_at(index_env, constraint_reward)
+
+    def set_idx(self, index_env_list: List[Any]):
+        for index_env, env_id in enumerate(index_env_list):
+            self.set_idx_at(index_env, env_id)
+
+    def set_is_queried(self, is_queried_list: List[Any]):
+        for index_env, is_queried in enumerate(is_queried_list):
+            self.set_is_queried_at(index_env, is_queried)
+
+    def is_new_episode(self):
+        for write_fn in self._connection_write_fns:
+            write_fn((IS_NEW_EPISODE_COMMAND, None))
+        results = []
+        for read_fn in self._connection_read_fns:
+            results.append(read_fn())
+        return results
+
+    '''
+    def shortest_path_actions(self):
+        for write_fn in self._connection_write_fns:
+            write_fn((ORACLE_ACTION_COMMAND, None))
+        results = []
+        for read_fn in self._connection_read_fns:
+            results.append(read_fn())
+        return results
+    '''
+
+    def episode_over(self):
+        for write_fn in self._connection_write_fns:
+            write_fn((CALL_COMMAND, (EPISODE_OVER_NAME, None)))
+        results = []
+        for read_fn in self._connection_read_fns:
+            results.append(read_fn())
+        return results
+
+    def get_metrics(self):
+        for write_fn in self._connection_write_fns:
+            write_fn((CALL_COMMAND, (GET_METRICS_NAME, None)))
+        results = []
+        for read_fn in self._connection_read_fns:
+            results.append(read_fn())
+        return results
+
+    def reset(self):
+        r"""Reset all the vectorized environments
+
+        :return: list of outputs from the reset method of envs.
+        """
+        for write_fn in self._connection_write_fns:
+            write_fn((RESET_COMMAND, None))
+        results = []
+        for read_fn in self._connection_read_fns:
+            results.append(read_fn())
+        return results
+
+    def reset_at(self, index_env: int):
+        r"""Reset in the index_env environment in the vector.
+
+        :param index_env: index of the environment to be reset
+        :return: list containing the output of reset method of indexed env.
+        """
+        self._connection_write_fns[index_env]((RESET_COMMAND, None))
+        results = [self._connection_read_fns[index_env]()]
+        return results
+
+    def async_step_at(
+        self, index_env: int, action: Union[int, str, Dict[str, Any]]
+    ) -> None:
+        # Backward compatibility
+        if isinstance(action, (int, np.integer, str)):
+            action = {"action": {"action": action}}
+
+        self._warn_cuda_tensors(action)
+        self._connection_write_fns[index_env]((STEP_COMMAND, action))
+
+    @profiling_wrapper.RangeContext("wait_step_at")
+    def wait_step_at(self, index_env: int) -> Any:
+        return self._connection_read_fns[index_env]()
+
+    def step_at(self, index_env: int, action: Union[int, str, Dict[str, Any]]):
+        r"""Step in the index_env environment in the vector.
+
+        :param index_env: index of the environment to be stepped into
+        :param action: action to be taken
+        :return: list containing the output of step method of indexed env.
+        """
+        self.async_step_at(index_env, action)
+        return self.wait_step_at(index_env)
+
+    def async_step(self, data: List[Union[int, str, Dict[str, Any]]]) -> None:
+        r"""Asynchronously step in the environments.
+
+        :param data: list of size _num_envs containing keyword arguments to
+            pass to :ref:`step` method for each Environment. For example,
+            :py:`[{"action": "TURN_LEFT", "action_args": {...}}, ...]`.
+ """ + + for index_env, act in enumerate(data): + self.async_step_at(index_env, act) + + @profiling_wrapper.RangeContext("wait_step") + def wait_step(self) -> List[Any]: + r"""Wait until all the asynchronized environments have synchronized.""" + return [ + self.wait_step_at(index_env) for index_env in range(self.num_envs) + ] + + def step(self, data: List[Union[int, str, Dict[str, Any]]]) -> List[Any]: + r"""Perform actions in the vectorized environments. + + :param data: list of size _num_envs containing keyword arguments to + pass to :ref:`step` method for each Environment. For example, + :py:`[{"action": "TURN_LEFT", "action_args": {...}}, ...]`. + :return: list of outputs from the step method of envs. + """ + self.async_step(data) + return self.wait_step() + + def close(self) -> None: + if self._is_closed: + return + + for read_fn in self._connection_read_fns: + if read_fn.is_waiting: + read_fn() + + for write_fn in self._connection_write_fns: + write_fn((CLOSE_COMMAND, None)) + + for _, _, write_fn, _ in self._paused: + write_fn((CLOSE_COMMAND, None)) + + for process in self._workers: + process.join() + + for _, _, _, process in self._paused: + process.join() + + self._is_closed = True + + def pause_at(self, index: int) -> None: + r"""Pauses computation on this env without destroying the env. + + :param index: which env to pause. All indexes after this one will be + shifted down by one. + + This is useful for not needing to call steps on all environments when + only some are active (for example during the last episodes of running + eval episodes). + """ + if self._connection_read_fns[index].is_waiting: + self._connection_read_fns[index]() + read_fn = self._connection_read_fns.pop(index) + write_fn = self._connection_write_fns.pop(index) + worker = self._workers.pop(index) + self._paused.append((index, read_fn, write_fn, worker)) + + def resume_all(self) -> None: + r"""Resumes any paused envs.""" + for index, read_fn, write_fn, worker in reversed(self._paused): + self._connection_read_fns.insert(index, read_fn) + self._connection_write_fns.insert(index, write_fn) + self._workers.insert(index, worker) + self._paused = [] + + def call_at( + self, + index: int, + function_name: str, + function_args: Optional[Dict[str, Any]] = None, + ) -> Any: + r"""Calls a function or retrieves a property/member variable (which is passed by name) + on the selected env and returns the result. + + :param index: which env to call the function on. + :param function_name: the name of the function to call or property to retrieve on the env. + :param function_args: optional function args. + :return: result of calling the function. + """ + self._connection_write_fns[index]( + (CALL_COMMAND, (function_name, function_args)) + ) + result = self._connection_read_fns[index]() + return result + + def call( + self, + function_names: List[str], + function_args_list: Optional[List[Any]] = None, + ) -> List[Any]: + r"""Calls a list of functions (which are passed by name) on the + corresponding env (by index). + + :param function_names: the name of the functions to call on the envs. + :param function_args_list: list of function args for each function. If + provided, :py:`len(function_args_list)` should be as long as + :py:`len(function_names)`. + :return: result of calling the function. 
+        """
+        if function_args_list is None:
+            function_args_list = [None] * len(function_names)
+        assert len(function_names) == len(function_args_list)
+        func_args = zip(function_names, function_args_list)
+        for write_fn, func_args_on in zip(
+            self._connection_write_fns, func_args
+        ):
+            write_fn((CALL_COMMAND, func_args_on))
+        results = []
+        for read_fn in self._connection_read_fns:
+            results.append(read_fn())
+        return results
+
+    def render(
+        self, mode: str = "human", *args, **kwargs
+    ) -> Union[np.ndarray, None]:
+        r"""Render observations from all environments in a tiled image."""
+        for write_fn in self._connection_write_fns:
+            write_fn((RENDER_COMMAND, (args, {"mode": "rgb", **kwargs})))
+        images = [read_fn() for read_fn in self._connection_read_fns]
+        tile = tile_images(images)
+        if mode == "human":
+            from habitat.core.utils import try_cv2_import
+
+            cv2 = try_cv2_import()
+
+            cv2.imshow("vecenv", tile[:, :, ::-1])
+            cv2.waitKey(1)
+            return None
+        elif mode == "rgb_array":
+            return tile
+        else:
+            raise NotImplementedError
+
+    @property
+    def _valid_start_methods(self) -> Set[str]:
+        return {"forkserver", "spawn", "fork"}
+
+    def _warn_cuda_tensors(
+        self, action: Dict[str, Any], prefix: Optional[str] = None
+    ):
+        if torch is None:
+            return
+
+        for k, v in action.items():
+            if isinstance(v, dict):
+                subk = f"{prefix}.{k}" if prefix is not None else k
+                self._warn_cuda_tensors(v, prefix=subk)
+            elif torch.is_tensor(v) and v.device.type == "cuda":
+                subk = f"{prefix}.{k}" if prefix is not None else k
+                warnings.warn(
+                    "Action with key {} is a CUDA tensor."
+                    " This will result in a CUDA context in the subprocess worker."
+                    " Using CPU tensors instead is recommended.".format(subk)
+                )
+
+    def __del__(self):
+        self.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
+
+
+class ThreadedVectorEnv(VectorEnv):
+    r"""Provides the same functionality as :ref:`VectorEnv`; the only
+    difference is that it runs in a multi-thread setup inside a single
+    process.
+
+    The :ref:`VectorEnv` runs in a multi-process setup, which keeps debuggers
+    from stepping into environment code. :ref:`ThreadedVectorEnv` makes
+    debugging much easier because you can actually put break points in the
+    environment methods. It should not be used when performance matters.
+ """ + + def _spawn_workers( + self, + env_fn_args: Sequence[Tuple], + make_env_fn: Callable[..., Env] = _make_env_fn, + workers_ignore_signals: bool = False, + ) -> Tuple[List[_ReadWrapper], List[_WriteWrapper]]: + queues: Iterator[Tuple[Any, ...]] = zip( + *[(Queue(), Queue()) for _ in range(self._num_envs)] + ) + parent_read_queues, parent_write_queues = queues + self._workers = [] + for parent_read_queue, parent_write_queue, env_args in zip( + parent_read_queues, parent_write_queues, env_fn_args + ): + thread = Thread( + target=self._worker_env, + args=( + parent_write_queue.get, + parent_read_queue.put, + make_env_fn, + env_args, + self._auto_reset_done, + ), + ) + self._workers.append(thread) + thread.daemon = True + thread.start() + + read_fns = [ + _ReadWrapper(q.get, rank) + for rank, q in enumerate(parent_read_queues) + ] + write_fns = [ + _WriteWrapper(q.put, read_wrapper) + for q, read_wrapper in zip(parent_write_queues, read_fns) + ] + return read_fns, write_fns diff --git a/habitat-lab-dialog/habitat/datasets/__init__.py b/habitat-lab-dialog/habitat/datasets/__init__.py new file mode 100644 index 0000000..ccdd0c9 --- /dev/null +++ b/habitat-lab-dialog/habitat/datasets/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat.datasets.registration import make_dataset # noqa: F401 . diff --git a/habitat-lab-dialog/habitat/datasets/eqa/__init__.py b/habitat-lab-dialog/habitat/datasets/eqa/__init__.py new file mode 100644 index 0000000..cad5682 --- /dev/null +++ b/habitat-lab-dialog/habitat/datasets/eqa/__init__.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat.core.dataset import Dataset +from habitat.core.registry import registry + + +def _try_register_mp3d_eqa_dataset(): + try: + from habitat.datasets.eqa.mp3d_eqa_dataset import ( # noqa: F401 isort:skip + Matterport3dDatasetV1, + ) + except ImportError as e: + mp3deqa_import_error = e + + @registry.register_dataset(name="MP3DEQA-v1") + class Matterport3dDatasetImportError(Dataset): + def __init__(self, *args, **kwargs): + raise mp3deqa_import_error diff --git a/habitat-lab-dialog/habitat/datasets/eqa/mp3d_eqa_dataset.py b/habitat-lab-dialog/habitat/datasets/eqa/mp3d_eqa_dataset.py new file mode 100644 index 0000000..6d75548 --- /dev/null +++ b/habitat-lab-dialog/habitat/datasets/eqa/mp3d_eqa_dataset.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
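+# NOTE: illustrative sketch, not part of the upstream files. It shows how the
+# `VectorEnv` / `ThreadedVectorEnv` classes from habitat/core/vector_env.py
+# above are constructed; `habitat.get_config` and the config path are assumed
+# to exist in the installed package and local checkout.
+def _example_vector_env_usage():
+    import habitat
+    from habitat.core.vector_env import VectorEnv, _make_env_fn
+
+    configs = [
+        habitat.get_config("configs/tasks/pointnav.yaml") for _ in range(2)
+    ]
+    env_fn_args = tuple(
+        (config, None, rank) for rank, config in enumerate(configs)
+    )
+
+    # Swap VectorEnv for ThreadedVectorEnv (same interface) to use
+    # break points inside environment code while debugging.
+    envs = VectorEnv(make_env_fn=_make_env_fn, env_fn_args=env_fn_args)
+    try:
+        envs.reset()
+        observations = envs.step(["TURN_LEFT"] * envs.num_envs)
+    finally:
+        envs.close()
+    return observations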
+ +import gzip +import json +import os +from typing import List, Optional + +from habitat.config import Config +from habitat.core.dataset import Dataset +from habitat.core.registry import registry +from habitat.core.simulator import AgentState +from habitat.datasets.utils import VocabDict +from habitat.tasks.eqa.eqa import EQAEpisode, QuestionData +from habitat.tasks.nav.nav import ShortestPathPoint +from habitat.tasks.nav.object_nav_task import ObjectGoal + +EQA_MP3D_V1_VAL_EPISODE_COUNT = 1950 +DEFAULT_SCENE_PATH_PREFIX = "data/scene_datasets/" + + +def get_default_mp3d_v1_config(split: str = "val"): + config = Config() + config.name = "MP3DEQA-v1" + config.DATA_PATH = "data/datasets/eqa/mp3d/v1/{split}.json.gz" + config.SPLIT = split + return config + + +@registry.register_dataset(name="MP3DEQA-v1") +class Matterport3dDatasetV1(Dataset): + r"""Class inherited from Dataset that loads Matterport3D + Embodied Question Answering dataset. + + This class can then be used as follows:: + eqa_config.dataset = get_default_mp3d_v1_config() + eqa = habitat.make_task(eqa_config.task_name, config=eqa_config) + """ + + episodes: List[EQAEpisode] + answer_vocab: VocabDict + question_vocab: VocabDict + + @staticmethod + def check_config_paths_exist(config: Config) -> bool: + return os.path.exists(config.DATA_PATH.format(split=config.SPLIT)) + + def __init__(self, config: Config = None) -> None: + self.episodes = [] + + if config is None: + return + + with gzip.open(config.DATA_PATH.format(split=config.SPLIT), "rt") as f: + self.from_json(f.read(), scenes_dir=config.SCENES_DIR) + + self.episodes = list( + filter(self.build_content_scenes_filter(config), self.episodes) + ) + + def from_json( + self, json_str: str, scenes_dir: Optional[str] = None + ) -> None: + deserialized = json.loads(json_str) + self.__dict__.update( + deserialized + ) # This is a messy hack... Why do we do this. + self.answer_vocab = VocabDict( + word_list=self.answer_vocab["word_list"] # type: ignore + ) + self.question_vocab = VocabDict( + word_list=self.question_vocab["word_list"] # type: ignore + ) + + for ep_index, episode in enumerate(deserialized["episodes"]): + episode = EQAEpisode(**episode) + if scenes_dir is not None: + if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX): + episode.scene_id = episode.scene_id[ + len(DEFAULT_SCENE_PATH_PREFIX) : + ] + episode.scene_id = os.path.join(scenes_dir, episode.scene_id) + episode.question = QuestionData(**episode.question) + for g_index, goal in enumerate(episode.goals): + episode.goals[g_index] = ObjectGoal(**goal) + new_goal = episode.goals[g_index] + if new_goal.view_points is not None: + for p_index, agent_state in enumerate( + new_goal.view_points + ): + new_goal.view_points[p_index] = AgentState( + **agent_state + ) + if episode.shortest_paths is not None: + for path in episode.shortest_paths: + for p_index, point in enumerate(path): + path[p_index] = ShortestPathPoint(**point) + self.episodes[ep_index] = episode diff --git a/habitat-lab-dialog/habitat/datasets/object_nav/__init__.py b/habitat-lab-dialog/habitat/datasets/object_nav/__init__.py new file mode 100644 index 0000000..b7e831d --- /dev/null +++ b/habitat-lab-dialog/habitat/datasets/object_nav/__init__.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
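Editorial note (not part of the patch): a minimal loading sketch for the EQA dataset defined above. It assumes the v1 episodes live under data/datasets/ and the MP3D scenes under data/scene_datasets/; the extra config fields set here (SCENES_DIR, CONTENT_SCENES) are ones the loader reads but that get_default_mp3d_v1_config does not populate.

# Minimal sketch: load the MP3D EQA val split through the registry.
from habitat.datasets import make_dataset
from habitat.datasets.eqa.mp3d_eqa_dataset import get_default_mp3d_v1_config

config = get_default_mp3d_v1_config(split="val")
config.SCENES_DIR = "data/scene_datasets/"  # where the MP3D scenes live
config.CONTENT_SCENES = ["*"]               # "*" is ALL_SCENES_MASK: keep every scene
dataset = make_dataset("MP3DEQA-v1", config=config)
print(len(dataset.episodes))                # 1950 for the v1 val split
print(dataset.question_vocab.get_size())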
+ +from habitat.core.dataset import Dataset +from habitat.core.registry import registry + + +# TODO(akadian): This is a result of moving SimulatorActions away from core +# and into simulators specifically. As a result of that the connection points +# for our tasks and datasets for actions is coming from inside habitat-sim +# which makes it impossible for anyone to use habitat-lab without having +# habitat-sim installed. In a future PR we will implement a base simulator +# action class which will be the connection point for tasks and datasets. +# Post that PR we would no longer need try register blocks. +def _try_register_objectnavdatasetv1(): + try: + from habitat.datasets.object_nav.object_nav_dataset import ( # noqa: F401 + ObjectNavDatasetV1, + ) + + except ImportError as e: + pointnav_import_error = e + + @registry.register_dataset(name="ObjectNav-v1") + class ObjectNavDatasetImportError(Dataset): + def __init__(self, *args, **kwargs): + raise pointnav_import_error diff --git a/habitat-lab-dialog/habitat/datasets/object_nav/object_nav_dataset.py b/habitat-lab-dialog/habitat/datasets/object_nav/object_nav_dataset.py new file mode 100644 index 0000000..0d26ea1 --- /dev/null +++ b/habitat-lab-dialog/habitat/datasets/object_nav/object_nav_dataset.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os +from typing import Any, Dict, List, Optional, Sequence + +from habitat.config import Config +from habitat.core.registry import registry +from habitat.core.simulator import AgentState, ShortestPathPoint +from habitat.core.utils import DatasetFloatJSONEncoder +from habitat.datasets.pointnav.pointnav_dataset import ( + CONTENT_SCENES_PATH_FIELD, + DEFAULT_SCENE_PATH_PREFIX, + PointNavDatasetV1, +) +from habitat.tasks.nav.object_nav_task import ( + ObjectGoal, + ObjectGoalNavEpisode, + ObjectViewLocation, +) + + +@registry.register_dataset(name="ObjectNav-v1") +class ObjectNavDatasetV1(PointNavDatasetV1): + r"""Class inherited from PointNavDataset that loads Object Navigation dataset.""" + category_to_task_category_id: Dict[str, int] + category_to_scene_annotation_category_id: Dict[str, int] + episodes: List[ObjectGoalNavEpisode] = [] # type: ignore + content_scenes_path: str = "{data_path}/content/{scene}.json.gz" + goals_by_category: Dict[str, Sequence[ObjectGoal]] + + @staticmethod + def dedup_goals(dataset: Dict[str, Any]) -> Dict[str, Any]: + if len(dataset["episodes"]) == 0: + return dataset + + goals_by_category = {} + for i, ep in enumerate(dataset["episodes"]): + dataset["episodes"][i]["object_category"] = ep["goals"][0][ + "object_category" + ] + ep = ObjectGoalNavEpisode(**ep) + + goals_key = ep.goals_key + if goals_key not in goals_by_category: + goals_by_category[goals_key] = ep.goals + + dataset["episodes"][i]["goals"] = [] + + dataset["goals_by_category"] = goals_by_category + + return dataset + + def to_json(self) -> str: + for i in range(len(self.episodes)): + self.episodes[i].goals = [] + + result = DatasetFloatJSONEncoder().encode(self) + + for i in range(len(self.episodes)): + goals = self.goals_by_category[self.episodes[i].goals_key] + if not isinstance(goals, list): + goals = list(goals) + self.episodes[i].goals = goals + + return result + + def __init__(self, config: Optional[Config] = None) -> None: + self.goals_by_category = {} + super().__init__(config) + self.episodes = 
list(self.episodes) + + @staticmethod + def __deserialize_goal(serialized_goal: Dict[str, Any]) -> ObjectGoal: + g = ObjectGoal(**serialized_goal) + + for vidx, view in enumerate(g.view_points): + view_location = ObjectViewLocation(**view) # type: ignore + view_location.agent_state = AgentState(**view_location.agent_state) # type: ignore + g.view_points[vidx] = view_location + + return g + + def from_json( + self, json_str: str, scenes_dir: Optional[str] = None + ) -> None: + deserialized = json.loads(json_str) + if CONTENT_SCENES_PATH_FIELD in deserialized: + self.content_scenes_path = deserialized[CONTENT_SCENES_PATH_FIELD] + + if "category_to_task_category_id" in deserialized: + self.category_to_task_category_id = deserialized[ + "category_to_task_category_id" + ] + + if "category_to_scene_annotation_category_id" in deserialized: + self.category_to_scene_annotation_category_id = deserialized[ + "category_to_scene_annotation_category_id" + ] + + if "category_to_mp3d_category_id" in deserialized: + self.category_to_scene_annotation_category_id = deserialized[ + "category_to_mp3d_category_id" + ] + + assert len(self.category_to_task_category_id) == len( + self.category_to_scene_annotation_category_id + ) + + assert set(self.category_to_task_category_id.keys()) == set( + self.category_to_scene_annotation_category_id.keys() + ), "category_to_task and category_to_mp3d must have the same keys" + + if len(deserialized["episodes"]) == 0: + return + + if "goals_by_category" not in deserialized: + deserialized = self.dedup_goals(deserialized) + + for k, v in deserialized["goals_by_category"].items(): + self.goals_by_category[k] = [self.__deserialize_goal(g) for g in v] + + for i, episode in enumerate(deserialized["episodes"]): + episode = ObjectGoalNavEpisode(**episode) + episode.episode_id = str(i) + + if scenes_dir is not None: + if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX): + episode.scene_id = episode.scene_id[ + len(DEFAULT_SCENE_PATH_PREFIX) : + ] + + episode.scene_id = os.path.join(scenes_dir, episode.scene_id) + + episode.goals = self.goals_by_category[episode.goals_key] + + if episode.shortest_paths is not None: + for path in episode.shortest_paths: + for p_index, point in enumerate(path): + if point is None or isinstance(point, (int, str)): + point = { + "action": point, + "rotation": None, + "position": None, + } + + path[p_index] = ShortestPathPoint(**point) + + self.episodes.append(episode) # type: ignore [attr-defined] diff --git a/habitat-lab-dialog/habitat/datasets/pointnav/__init__.py b/habitat-lab-dialog/habitat/datasets/pointnav/__init__.py new file mode 100644 index 0000000..e347e04 --- /dev/null +++ b/habitat-lab-dialog/habitat/datasets/pointnav/__init__.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat.core.dataset import Dataset +from habitat.core.registry import registry + + +# TODO(akadian): This is a result of moving SimulatorActions away from core +# and into simulators specifically. As a result of that the connection points +# for our tasks and datasets for actions is coming from inside habitat-sim +# which makes it impossible for anyone to use habitat-lab without having +# habitat-sim installed. In a future PR we will implement a base simulator +# action class which will be the connection point for tasks and datasets. 
+# Post that PR we would no longer need try register blocks. +def _try_register_pointnavdatasetv1(): + try: + from habitat.datasets.pointnav.pointnav_dataset import ( # noqa: F401 + PointNavDatasetV1, + ) + + except ImportError as e: + pointnav_import_error = e + + @registry.register_dataset(name="PointNav-v1") + class PointnavDatasetImportError(Dataset): + def __init__(self, *args, **kwargs): + raise pointnav_import_error diff --git a/habitat-lab-dialog/habitat/datasets/pointnav/pointnav_dataset.py b/habitat-lab-dialog/habitat/datasets/pointnav/pointnav_dataset.py new file mode 100644 index 0000000..9fff667 --- /dev/null +++ b/habitat-lab-dialog/habitat/datasets/pointnav/pointnav_dataset.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import gzip +import json +import os +from typing import List, Optional + +from habitat.config import Config +from habitat.core.dataset import ALL_SCENES_MASK, Dataset +from habitat.core.registry import registry +from habitat.tasks.nav.nav import ( + NavigationEpisode, + NavigationGoal, + ShortestPathPoint, +) + +CONTENT_SCENES_PATH_FIELD = "content_scenes_path" +DEFAULT_SCENE_PATH_PREFIX = "data/scene_datasets/" + + +@registry.register_dataset(name="PointNav-v1") +class PointNavDatasetV1(Dataset): + r"""Class inherited from Dataset that loads Point Navigation dataset.""" + + episodes: List[NavigationEpisode] + content_scenes_path: str = "{data_path}/content/{scene}.json.gz" + + @staticmethod + def check_config_paths_exist(config: Config) -> bool: + return os.path.exists( + config.DATA_PATH.format(split=config.SPLIT) + ) and os.path.exists(config.SCENES_DIR) + + @classmethod + def get_scenes_to_load(cls, config: Config) -> List[str]: + r"""Return list of scene ids for which dataset has separate files with + episodes. 
+ """ + assert cls.check_config_paths_exist(config) + dataset_dir = os.path.dirname( + config.DATA_PATH.format(split=config.SPLIT) + ) + + cfg = config.clone() + cfg.defrost() + cfg.CONTENT_SCENES = [] + dataset = cls(cfg) + has_individual_scene_files = os.path.exists( + dataset.content_scenes_path.split("{scene}")[0].format( + data_path=dataset_dir + ) + ) + if has_individual_scene_files: + return cls._get_scenes_from_folder( + content_scenes_path=dataset.content_scenes_path, + dataset_dir=dataset_dir, + ) + else: + # Load the full dataset, things are not split into separate files + cfg.CONTENT_SCENES = [ALL_SCENES_MASK] + dataset = cls(cfg) + return list(map(cls.scene_from_scene_path, dataset.scene_ids)) + + @staticmethod + def _get_scenes_from_folder( + content_scenes_path: str, dataset_dir: str + ) -> List[str]: + scenes: List[str] = [] + content_dir = content_scenes_path.split("{scene}")[0] + scene_dataset_ext = content_scenes_path.split("{scene}")[1] + content_dir = content_dir.format(data_path=dataset_dir) + if not os.path.exists(content_dir): + return scenes + + for filename in os.listdir(content_dir): + if filename.endswith(scene_dataset_ext): + scene = filename[: -len(scene_dataset_ext)] + scenes.append(scene) + scenes.sort() + return scenes + + def __init__(self, config: Optional[Config] = None) -> None: + self.episodes = [] + + if config is None: + return + + datasetfile_path = config.DATA_PATH.format(split=config.SPLIT) + with gzip.open(datasetfile_path, "rt") as f: + self.from_json(f.read(), scenes_dir=config.SCENES_DIR) + + # Read separate file for each scene + dataset_dir = os.path.dirname(datasetfile_path) + has_individual_scene_files = os.path.exists( + self.content_scenes_path.split("{scene}")[0].format( + data_path=dataset_dir + ) + ) + if has_individual_scene_files: + scenes = config.CONTENT_SCENES + if ALL_SCENES_MASK in scenes: + scenes = self._get_scenes_from_folder( + content_scenes_path=self.content_scenes_path, + dataset_dir=dataset_dir, + ) + + for scene in scenes: + scene_filename = self.content_scenes_path.format( + data_path=dataset_dir, scene=scene + ) + with gzip.open(scene_filename, "rt") as f: + self.from_json(f.read(), scenes_dir=config.SCENES_DIR) + + else: + self.episodes = list( + filter(self.build_content_scenes_filter(config), self.episodes) + ) + + def from_json( + self, json_str: str, scenes_dir: Optional[str] = None + ) -> None: + deserialized = json.loads(json_str) + if CONTENT_SCENES_PATH_FIELD in deserialized: + self.content_scenes_path = deserialized[CONTENT_SCENES_PATH_FIELD] + + for episode in deserialized["episodes"]: + episode = NavigationEpisode(**episode) + + if scenes_dir is not None: + if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX): + episode.scene_id = episode.scene_id[ + len(DEFAULT_SCENE_PATH_PREFIX) : + ] + + episode.scene_id = os.path.join(scenes_dir, episode.scene_id) + + for g_index, goal in enumerate(episode.goals): + episode.goals[g_index] = NavigationGoal(**goal) + if episode.shortest_paths is not None: + for path in episode.shortest_paths: + for p_index, point in enumerate(path): + path[p_index] = ShortestPathPoint(**point) + self.episodes.append(episode) diff --git a/habitat-lab-dialog/habitat/datasets/pointnav/pointnav_generator.py b/habitat-lab-dialog/habitat/datasets/pointnav/pointnav_generator.py new file mode 100644 index 0000000..b81186f --- /dev/null +++ b/habitat-lab-dialog/habitat/datasets/pointnav/pointnav_generator.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. 
and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, Generator, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+from numpy import float64
+
+from habitat.core.simulator import ShortestPathPoint
+from habitat.datasets.utils import get_action_shortest_path
+from habitat.tasks.nav.nav import NavigationEpisode, NavigationGoal
+
+try:
+    from habitat_sim.errors import GreedyFollowerError
+except ImportError:
+    GreedyFollowerError = BaseException
+try:
+    from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim
+except ImportError:
+    HabitatSim = BaseException
+
+# A minimum radius of a plane that a point should be part of to be
+# considered as a target or source location. Used to filter isolated
+# points that aren't part of a floor.
+ISLAND_RADIUS_LIMIT = 1.5
+
+
+def _ratio_sample_rate(ratio: float, ratio_threshold: float) -> float:
+    r"""Sampling function for aggressive filtering of straight-line
+    episodes based on the geodesic shortest path to Euclidean distance
+    ratio.
+
+    :param ratio: geodesic to Euclidean distance ratio
+    :param ratio_threshold: ratio upper limit up to which aggressive
+        sampling is applied.
+    :return: value between 0.008 and 0.288 for ratio in [1, 1.1]
+    """
+    assert ratio < ratio_threshold
+    return 20 * (ratio - 0.98) ** 2
+
+
+def is_compatible_episode(
+    s: Sequence[float],
+    t: Sequence[float],
+    sim: "HabitatSim",
+    near_dist: float,
+    far_dist: float,
+    geodesic_to_euclid_ratio: float,
+) -> Union[Tuple[bool, float], Tuple[bool, int]]:
+    euclid_dist = np.power(np.power(np.array(s) - np.array(t), 2).sum(0), 0.5)
+    if np.abs(s[1] - t[1]) > 0.5:
+        # A large height difference means s and t are on different floors.
+        return False, 0
+    d_separation = sim.geodesic_distance(s, [t])
+    if d_separation == np.inf:
+        return False, 0
+    if not near_dist <= d_separation <= far_dist:
+        return False, 0
+    distances_ratio = d_separation / euclid_dist
+    if distances_ratio < geodesic_to_euclid_ratio and (
+        np.random.rand()
+        > _ratio_sample_rate(distances_ratio, geodesic_to_euclid_ratio)
+    ):
+        return False, 0
+    if sim.island_radius(s) < ISLAND_RADIUS_LIMIT:
+        return False, 0
+    return True, d_separation
+
+
+def _create_episode(
+    episode_id: Union[int, str],
+    scene_id: str,
+    start_position: List[float],
+    start_rotation: List[Union[int, float64]],
+    target_position: List[float],
+    shortest_paths: Optional[List[List[ShortestPathPoint]]] = None,
+    radius: Optional[float] = None,
+    info: Optional[Dict[str, float]] = None,
+) -> Optional[NavigationEpisode]:
+    goals = [NavigationGoal(position=target_position, radius=radius)]
+    return NavigationEpisode(
+        episode_id=str(episode_id),
+        goals=goals,
+        scene_id=scene_id,
+        start_position=start_position,
+        start_rotation=start_rotation,
+        shortest_paths=shortest_paths,
+        info=info,
+    )
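Editorial note (not part of the patch): the rejection curve used by is_compatible_episode is worth seeing numerically. For a geodesic-to-Euclidean ratio in [1, 1.1), an episode is kept only with probability 20 * (ratio - 0.98) ** 2, so near-straight-line episodes barely survive while episodes close to the 1.1 threshold survive far more often.

# Minimal sketch: keep-probabilities across the rejection range.
for ratio in (1.0, 1.05, 1.09):
    keep_prob = 20 * (ratio - 0.98) ** 2
    print(f"ratio={ratio:.2f} -> keep probability {keep_prob:.3f}")
# ratio=1.00 -> keep probability 0.008
# ratio=1.05 -> keep probability 0.098
# ratio=1.09 -> keep probability 0.242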
+
+
+def generate_pointnav_episode(
+    sim: "HabitatSim",
+    num_episodes: int = -1,
+    is_gen_shortest_path: bool = True,
+    shortest_path_success_distance: float = 0.2,
+    shortest_path_max_steps: int = 500,
+    closest_dist_limit: float = 1,
+    furthest_dist_limit: float = 30,
+    geodesic_to_euclid_min_ratio: float = 1.1,
+    number_retries_per_target: int = 10,
+) -> Generator[NavigationEpisode, None, None]:
+    r"""Generator function that generates PointGoal navigation episodes.
+
+    An episode is trivial if there is an obstacle-free, straight line
+    between the start and goal positions. A good measure of the navigation
+    complexity of an episode is the ratio of the geodesic shortest-path
+    distance between the start and goal positions to the corresponding
+    Euclidean distance. If the ratio is nearly 1, there are few obstacles
+    and the episode is easy; if the ratio is larger than 1, the episode is
+    difficult because strategic navigation is required. To keep the
+    navigation complexity of the precomputed episodes reasonably high, we
+    perform aggressive rejection sampling for episodes with the above
+    ratio falling in the range [1, 1.1]. Following this, there is a
+    significant decrease in the number of near-straight-line episodes.
+
+    :param sim: simulator with loaded scene for generation.
+    :param num_episodes: number of episodes needed to generate
+    :param is_gen_shortest_path: option to generate shortest paths
+    :param shortest_path_success_distance: success distance when agent
+        should stop during shortest path generation
+    :param shortest_path_max_steps: maximum number of steps the shortest
+        path is expected to take
+    :param closest_dist_limit: episode geodesic distance lowest limit
+    :param furthest_dist_limit: episode geodesic distance highest limit
+    :param geodesic_to_euclid_min_ratio: geodesic to Euclidean distance
+        ratio threshold below which aggressive rejection sampling is
+        applied.
+    :return: navigation episodes that satisfy the specified distribution
+        for the scene currently loaded into the simulator.
+    """
+    episode_count = 0
+    while episode_count < num_episodes or num_episodes < 0:
+        target_position = sim.sample_navigable_point()
+
+        if sim.island_radius(target_position) < ISLAND_RADIUS_LIMIT:
+            continue
+
+        for _retry in range(number_retries_per_target):
+            source_position = sim.sample_navigable_point()
+
+            is_compatible, dist = is_compatible_episode(
+                source_position,
+                target_position,
+                sim,
+                near_dist=closest_dist_limit,
+                far_dist=furthest_dist_limit,
+                geodesic_to_euclid_ratio=geodesic_to_euclid_min_ratio,
+            )
+            if is_compatible:
+                break
+        if is_compatible:
+            angle = np.random.uniform(0, 2 * np.pi)
+            source_rotation = [0, np.sin(angle / 2), 0, np.cos(angle / 2)]
+
+            shortest_paths = None
+            if is_gen_shortest_path:
+                try:
+                    shortest_paths = [
+                        get_action_shortest_path(
+                            sim,
+                            source_position=source_position,
+                            source_rotation=source_rotation,
+                            goal_position=target_position,
+                            success_distance=shortest_path_success_distance,
+                            max_episode_steps=shortest_path_max_steps,
+                        )
+                    ]
+                # Throws an error when it can't find a path
+                except GreedyFollowerError:
+                    continue
+
+            episode = _create_episode(
+                episode_id=episode_count,
+                scene_id=sim.habitat_config.SCENE,
+                start_position=source_position,
+                start_rotation=source_rotation,
+                target_position=target_position,
+                shortest_paths=shortest_paths,
+                radius=shortest_path_success_distance,
+                info={"geodesic_distance": dist},
+            )
+
+            episode_count += 1
+            yield episode
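Editorial note (not part of the patch): an end-to-end sketch of the generator above, assuming habitat and habitat-sim are installed; the config and output paths are illustrative, and SIMULATOR.SCENE must point at a scene with a navmesh.

# Minimal sketch: generate episodes for the loaded scene and serialize
# them in the same json.gz format the PointNav datasets read back.
import gzip

import habitat
from habitat.datasets.pointnav.pointnav_generator import (
    generate_pointnav_episode,
)
from habitat.sims import make_sim

sim_cfg = habitat.get_config("configs/tasks/pointnav.yaml").SIMULATOR
sim = make_sim("Sim-v0", config=sim_cfg)
episodes = list(generate_pointnav_episode(sim, num_episodes=10))
sim.close()

dataset = habitat.make_dataset("PointNav-v1")
dataset.episodes = episodes
with gzip.open("data/my_pointnav_episodes.json.gz", "wt") as f:
    f.write(dataset.to_json())  # the same format from_json() parses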
diff --git a/habitat-lab-dialog/habitat/datasets/registration.py b/habitat-lab-dialog/habitat/datasets/registration.py
new file mode 100644
index 0000000..e3dd61e
--- /dev/null
+++ b/habitat-lab-dialog/habitat/datasets/registration.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from habitat.core.logging import logger
+from habitat.core.registry import registry
+from habitat.datasets.eqa import _try_register_mp3d_eqa_dataset
+from habitat.datasets.object_nav import _try_register_objectnavdatasetv1
+from habitat.datasets.pointnav import _try_register_pointnavdatasetv1
+from habitat.datasets.vln import _try_register_r2r_vln_dataset
+
+
+def make_dataset(id_dataset, **kwargs):
+    logger.info("Initializing dataset {}".format(id_dataset))
+    _dataset = registry.get_dataset(id_dataset)
+    assert _dataset is not None, "Could not find dataset {}".format(id_dataset)
+
+    return _dataset(**kwargs)  # type: ignore
+
+
+_try_register_objectnavdatasetv1()
+_try_register_mp3d_eqa_dataset()
+_try_register_pointnavdatasetv1()
+_try_register_r2r_vln_dataset()
diff --git a/habitat-lab-dialog/habitat/datasets/utils.py b/habitat-lab-dialog/habitat/datasets/utils.py
new file mode 100644
index 0000000..6007a65
--- /dev/null
+++ b/habitat-lab-dialog/habitat/datasets/utils.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+    Tokenize and vocabulary utils, originally authored by @apsdehal and
+    taken from Pythia.
+"""
+import re
+import typing
+from collections import Counter
+from typing import Iterable, List, Union
+
+from numpy import float64
+
+from habitat.core.logging import logger
+from habitat.core.simulator import ShortestPathPoint
+from habitat.sims.habitat_simulator.actions import HabitatSimActions
+from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower
+from habitat.utils.geometry_utils import quaternion_to_list
+
+try:
+    from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim
+except ImportError:
+    pass
+
+SENTENCE_SPLIT_REGEX = re.compile(r"([^\w-]+)")
+
+
+def tokenize(
+    sentence, regex=SENTENCE_SPLIT_REGEX, keep=("'s",), remove=(",", "?")
+) -> List[str]:
+    sentence = sentence.lower()
+
+    for token in keep:
+        sentence = sentence.replace(token, " " + token)
+
+    for token in remove:
+        sentence = sentence.replace(token, "")
+
+    tokens = regex.split(sentence)
+    tokens = [t.strip() for t in tokens if len(t.strip()) > 0]
+    return tokens
+
+
+def load_str_list(fname):
+    with open(fname) as f:
+        lines = f.readlines()
+    lines = [line.strip() for line in lines]
+    return lines
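Editorial note (not part of the patch): a quick look at what tokenize() produces. Lowercasing and the removal of "," and "?" happen before the regex split.

# Minimal sketch of the tokenizer's output.
print(tokenize("What room is the lamp in?"))
# -> ['what', 'room', 'is', 'the', 'lamp', 'in']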
+
+
+class VocabDict:
+    UNK_TOKEN = "<unk>"
+    PAD_TOKEN = "<pad>"
+    START_TOKEN = "<s>"
+    END_TOKEN = "</s>"
+
+    def __init__(self, word_list=None, filepath=None):
+        if word_list is not None:
+            self.word_list = word_list
+            self._build()
+
+        elif filepath:
+            self.word_list = load_str_list(filepath)
+            self._build()
+
+    def _build(self):
+        if self.UNK_TOKEN not in self.word_list:
+            self.word_list = [self.UNK_TOKEN] + self.word_list
+
+        self.word2idx_dict = {w: n_w for n_w, w in enumerate(self.word_list)}
+
+        # String (word) to integer (index) dict mapping
+        self.stoi = self.word2idx_dict
+        # Integer to string (word) reverse mapping
+        self.itos = self.word_list
+        self.num_vocab = len(self.word_list)
+
+        self.UNK_INDEX = (
+            self.word2idx_dict[self.UNK_TOKEN]
+            if self.UNK_TOKEN in self.word2idx_dict
+            else None
+        )
+
+        self.PAD_INDEX = (
+            self.word2idx_dict[self.PAD_TOKEN]
+            if self.PAD_TOKEN in self.word2idx_dict
+            else None
+        )
+
+    def idx2word(self, n_w):
+        return self.word_list[n_w]
+
+    def token_idx_2_string(self, tokens: Iterable[int]) -> str:
+        q_string = ""
+        for token in tokens:
+            if token != 0:
+                q_string += self.idx2word(token) + " "
+
+        q_string += "?"
+        return q_string
+
+    def __len__(self):
+        return len(self.word_list)
+
+    def get_size(self):
+        return len(self.word_list)
+
+    def get_unk_index(self):
+        return self.UNK_INDEX
+
+    def get_unk_token(self):
+        return self.UNK_TOKEN
+
+    def word2idx(self, w):
+        if w in self.word2idx_dict:
+            return self.word2idx_dict[w]
+        elif self.UNK_INDEX is not None:
+            return self.UNK_INDEX
+        else:
+            raise ValueError(
+                "word %s not in dictionary \
+                (while dictionary does not contain <unk>)"
+                % w
+            )
+
+    def tokenize_and_index(
+        self,
+        sentence,
+        regex=SENTENCE_SPLIT_REGEX,
+        keep=("'s",),
+        remove=(",", "?"),
+    ) -> List[int]:
+        inds = [
+            self.word2idx(w)
+            for w in tokenize(sentence, regex=regex, keep=keep, remove=remove)
+        ]
+        return inds
+
+
+class VocabFromText(VocabDict):
+    DEFAULT_TOKENS = [
+        VocabDict.PAD_TOKEN,
+        VocabDict.UNK_TOKEN,
+        VocabDict.START_TOKEN,
+        VocabDict.END_TOKEN,
+    ]
+
+    def __init__(
+        self,
+        sentences,
+        min_count=1,
+        regex=SENTENCE_SPLIT_REGEX,
+        keep=(),
+        remove=(),
+        only_unk_extra=False,
+    ):
+        token_counter: typing.Counter[str] = Counter()
+
+        for sentence in sentences:
+            tokens = tokenize(sentence, regex=regex, keep=keep, remove=remove)
+            token_counter.update(tokens)
+
+        token_list = []
+        for token in token_counter:
+            if token_counter[token] >= min_count:
+                token_list.append(token)
+
+        extras = self.DEFAULT_TOKENS
+
+        if only_unk_extra:
+            extras = [self.UNK_TOKEN]
+
+        super(VocabFromText, self).__init__(word_list=extras + token_list)
+
+
+def get_action_shortest_path(
+    sim: "HabitatSim",
+    source_position: List[float],
+    source_rotation: List[Union[int, float64]],
+    goal_position: List[float],
+    success_distance: float = 0.05,
+    max_episode_steps: int = 500,
+) -> List[ShortestPathPoint]:
+    sim.reset()
+    sim.set_agent_state(source_position, source_rotation)
+    follower = ShortestPathFollower(sim, success_distance, False)
+
+    shortest_path = []
+    step_count = 0
+    action = follower.get_next_action(goal_position)
+    while (
+        action is not HabitatSimActions.STOP
+        and step_count < max_episode_steps
+    ):
+        state = sim.get_agent_state()
+        shortest_path.append(
+            ShortestPathPoint(
+                state.position.tolist(),
+                quaternion_to_list(state.rotation),
+                action,
+            )
+        )
+        sim.step(action)
+        step_count += 1
+        action = follower.get_next_action(goal_position)
+
+    if step_count == max_episode_steps:
+        logger.warning("Shortest path wasn't found.")
+    return shortest_path
diff --git a/habitat-lab-dialog/habitat/datasets/vln/__init__.py b/habitat-lab-dialog/habitat/datasets/vln/__init__.py
new file mode 100644
index 0000000..3d94cd4
--- /dev/null
+++ b/habitat-lab-dialog/habitat/datasets/vln/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
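Editorial note (not part of the patch): a minimal sketch of the vocabulary utilities defined in datasets/utils.py above, building a vocabulary from raw sentences and mapping text to indices the way the EQA/VLN datasets do with their stored word lists.

sentences = ["what room is the lamp in", "what color is the sofa"]
vocab = VocabFromText(sentences, min_count=1)
print(vocab.get_size())  # 12: 4 special tokens + 8 distinct words

inds = vocab.tokenize_and_index("what color is the lamp")
print(inds)
print([vocab.idx2word(i) for i in inds])

# An unseen word falls back to the <unk> index.
print(vocab.word2idx("banana") == vocab.UNK_INDEX)  # True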
+ +from habitat.core.dataset import Dataset +from habitat.core.registry import registry + + +def _try_register_r2r_vln_dataset(): + try: + from habitat.datasets.vln.r2r_vln_dataset import ( # noqa: F401 isort:skip + VLNDatasetV1, + ) + except ImportError as e: + r2r_vln_import_error = e + + @registry.register_dataset(name="R2RVLN-v1") + class R2RDatasetImportError(Dataset): + def __init__(self, *args, **kwargs): + raise r2r_vln_import_error diff --git a/habitat-lab-dialog/habitat/datasets/vln/r2r_vln_dataset.py b/habitat-lab-dialog/habitat/datasets/vln/r2r_vln_dataset.py new file mode 100644 index 0000000..e1c96d3 --- /dev/null +++ b/habitat-lab-dialog/habitat/datasets/vln/r2r_vln_dataset.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import gzip +import json +import os +from typing import List, Optional + +from habitat.config import Config +from habitat.core.dataset import Dataset +from habitat.core.registry import registry +from habitat.datasets.utils import VocabDict +from habitat.tasks.nav.nav import NavigationGoal +from habitat.tasks.vln.vln import InstructionData, VLNEpisode + +DEFAULT_SCENE_PATH_PREFIX = "data/scene_datasets/" + + +@registry.register_dataset(name="R2RVLN-v1") +class VLNDatasetV1(Dataset): + r"""Class inherited from Dataset that loads a Vision and Language + Navigation dataset. + """ + + episodes: List[VLNEpisode] + instruction_vocab: VocabDict + + @staticmethod + def check_config_paths_exist(config: Config) -> bool: + return os.path.exists( + config.DATA_PATH.format(split=config.SPLIT) + ) and os.path.exists(config.SCENES_DIR) + + def __init__(self, config: Optional[Config] = None) -> None: + self.episodes = [] + + if config is None: + return + + dataset_filename = config.DATA_PATH.format(split=config.SPLIT) + with gzip.open(dataset_filename, "rt") as f: + self.from_json(f.read(), scenes_dir=config.SCENES_DIR) + + self.episodes = list( + filter(self.build_content_scenes_filter(config), self.episodes) + ) + + def from_json( + self, json_str: str, scenes_dir: Optional[str] = None + ) -> None: + + deserialized = json.loads(json_str) + self.instruction_vocab = VocabDict( + word_list=deserialized["instruction_vocab"]["word_list"] + ) + + for episode in deserialized["episodes"]: + episode = VLNEpisode(**episode) + + if scenes_dir is not None: + if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX): + episode.scene_id = episode.scene_id[ + len(DEFAULT_SCENE_PATH_PREFIX) : + ] + + episode.scene_id = os.path.join(scenes_dir, episode.scene_id) + + episode.instruction = InstructionData(**episode.instruction) + for g_index, goal in enumerate(episode.goals): + episode.goals[g_index] = NavigationGoal(**goal) + self.episodes.append(episode) diff --git a/habitat-lab-dialog/habitat/py.typed b/habitat-lab-dialog/habitat/py.typed new file mode 100644 index 0000000..abe48a5 --- /dev/null +++ b/habitat-lab-dialog/habitat/py.typed @@ -0,0 +1 @@ +# Marker file for PEP 561. This tells mypy that the package uses inline types. diff --git a/habitat-lab-dialog/habitat/sims/__init__.py b/habitat-lab-dialog/habitat/sims/__init__.py new file mode 100644 index 0000000..c450f1c --- /dev/null +++ b/habitat-lab-dialog/habitat/sims/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. 
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from habitat.sims.registration import make_sim  # noqa: F401
diff --git a/habitat-lab-dialog/habitat/sims/habitat_simulator/__init__.py b/habitat-lab-dialog/habitat/sims/habitat_simulator/__init__.py
new file mode 100644
index 0000000..7facf8a
--- /dev/null
+++ b/habitat-lab-dialog/habitat/sims/habitat_simulator/__init__.py
@@ -0,0 +1,27 @@
+from habitat.core.registry import registry
+from habitat.core.simulator import Simulator
+
+# from habitat.sims.habitat_simulator.actions import (
+#     HabitatSimV1ActionSpaceConfiguration,
+# )
+
+
+def _try_register_habitat_sim():
+    try:
+        import habitat_sim  # noqa: F401
+
+        has_habitat_sim = True
+    except ImportError as e:
+        has_habitat_sim = False
+        habitat_sim_import_error = e
+
+    if has_habitat_sim:
+        from habitat.sims.habitat_simulator.actions import (  # noqa: F401
+            HabitatSimV1ActionSpaceConfiguration,
+        )
+    else:
+
+        @registry.register_simulator(name="Sim-v0")
+        class HabitatSimImportError(Simulator):
+            def __init__(self, *args, **kwargs):
+                raise habitat_sim_import_error
diff --git a/habitat-lab-dialog/habitat/sims/habitat_simulator/actions.py b/habitat-lab-dialog/habitat/sims/habitat_simulator/actions.py
new file mode 100644
index 0000000..3e029f4
--- /dev/null
+++ b/habitat-lab-dialog/habitat/sims/habitat_simulator/actions.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from enum import Enum
+from typing import Dict
+
+import attr
+
+import habitat_sim
+from habitat.core.registry import registry
+from habitat.core.simulator import ActionSpaceConfiguration
+from habitat.core.utils import Singleton
+
+
+class _DefaultHabitatSimActions(Enum):
+    STOP = 0
+    MOVE_FORWARD = 1
+    TURN_LEFT = 2
+    TURN_RIGHT = 3
+    LOOK_UP = 4
+    LOOK_DOWN = 5
+    # QUERY = 9
+
+
+@attr.s(auto_attribs=True, slots=True)
+class HabitatSimActionsSingleton(metaclass=Singleton):
+    r"""Implements an extendable Enum for the mapping of action names
+    to their integer values.
+
+    This means that new action names can be added, but old action names
+    cannot be removed, nor can their mapping be altered. This also ensures
+    that all actions are always contiguously mapped in
+    :py:`[0, len(HabitatSimActions) - 1]`.
+
+    This is accessible as the global singleton :ref:`HabitatSimActions`.
+    """
+
+    _known_actions: Dict[str, int] = attr.ib(init=False, factory=dict)
+
+    def __attrs_post_init__(self):
+        for action in _DefaultHabitatSimActions:
+            self._known_actions[action.name] = action.value
+
+    def extend_action_space(self, name: str) -> int:
+        r"""Extends the action space to accommodate a new action with
+        the name :p:`name`
+
+        :param name: The name of the new action
+        :return: The number the action is registered on
+
+        Usage:
+
+        ..
code:: py + + from habitat.sims.habitat_simulator.actions import HabitatSimActions + HabitatSimActions.extend_action_space("MY_ACTION") + print(HabitatSimActions.MY_ACTION) + """ + assert ( + name not in self._known_actions + ), "Cannot register an action name twice" + self._known_actions[name] = len(self._known_actions) + + return self._known_actions[name] + + def has_action(self, name: str) -> bool: + r"""Checks to see if action :p:`name` is already register + + :param name: The name to check + :return: Whether or not :p:`name` already exists + """ + + return name in self._known_actions + + def __getattr__(self, name): + return self._known_actions[name] + + def __getitem__(self, name): + return self._known_actions[name] + + def __len__(self): + return len(self._known_actions) + + def __iter__(self): + return iter(self._known_actions) + + +HabitatSimActions: HabitatSimActionsSingleton = HabitatSimActionsSingleton() + + +@registry.register_action_space_configuration(name="v0") +class HabitatSimV0ActionSpaceConfiguration(ActionSpaceConfiguration): + def get(self): + return { + HabitatSimActions.STOP: habitat_sim.ActionSpec("stop"), + HabitatSimActions.MOVE_FORWARD: habitat_sim.ActionSpec( + "move_forward", + habitat_sim.ActuationSpec( + amount=self.config.FORWARD_STEP_SIZE + ), + ), + HabitatSimActions.TURN_LEFT: habitat_sim.ActionSpec( + "turn_left", + habitat_sim.ActuationSpec(amount=self.config.TURN_ANGLE), + ), + HabitatSimActions.TURN_RIGHT: habitat_sim.ActionSpec( + "turn_right", + habitat_sim.ActuationSpec(amount=self.config.TURN_ANGLE), + ), + } + + +@registry.register_action_space_configuration(name="v1") +class HabitatSimV1ActionSpaceConfiguration( + HabitatSimV0ActionSpaceConfiguration +): + def get(self): + config = super().get() + new_config = { + HabitatSimActions.LOOK_UP: habitat_sim.ActionSpec( + "look_up", + habitat_sim.ActuationSpec(amount=self.config.TILT_ANGLE), + ), + HabitatSimActions.LOOK_DOWN: habitat_sim.ActionSpec( + "look_down", + habitat_sim.ActuationSpec(amount=self.config.TILT_ANGLE), + ), + } + + config.update(new_config) + + return config + + +@registry.register_action_space_configuration(name="pyrobotnoisy") +class HabitatSimPyRobotActionSpaceConfiguration(ActionSpaceConfiguration): + def get(self): + return { + HabitatSimActions.STOP: habitat_sim.ActionSpec("stop"), + HabitatSimActions.MOVE_FORWARD: habitat_sim.ActionSpec( + "pyrobot_noisy_move_forward", + habitat_sim.PyRobotNoisyActuationSpec( + amount=self.config.FORWARD_STEP_SIZE, + robot=self.config.NOISE_MODEL.ROBOT, + controller=self.config.NOISE_MODEL.CONTROLLER, + noise_multiplier=self.config.NOISE_MODEL.NOISE_MULTIPLIER, + ), + ), + HabitatSimActions.TURN_LEFT: habitat_sim.ActionSpec( + "pyrobot_noisy_turn_left", + habitat_sim.PyRobotNoisyActuationSpec( + amount=self.config.TURN_ANGLE, + robot=self.config.NOISE_MODEL.ROBOT, + controller=self.config.NOISE_MODEL.CONTROLLER, + noise_multiplier=self.config.NOISE_MODEL.NOISE_MULTIPLIER, + ), + ), + HabitatSimActions.TURN_RIGHT: habitat_sim.ActionSpec( + "pyrobot_noisy_turn_right", + habitat_sim.PyRobotNoisyActuationSpec( + amount=self.config.TURN_ANGLE, + robot=self.config.NOISE_MODEL.ROBOT, + controller=self.config.NOISE_MODEL.CONTROLLER, + noise_multiplier=self.config.NOISE_MODEL.NOISE_MULTIPLIER, + ), + ), + HabitatSimActions.LOOK_UP: habitat_sim.ActionSpec( + "look_up", + habitat_sim.ActuationSpec(amount=self.config.TILT_ANGLE), + ), + HabitatSimActions.LOOK_DOWN: habitat_sim.ActionSpec( + "look_down", + 
habitat_sim.ActuationSpec(amount=self.config.TILT_ANGLE), + ), + # The perfect actions are needed for the oracle planner + "_forward": habitat_sim.ActionSpec( + "move_forward", + habitat_sim.ActuationSpec( + amount=self.config.FORWARD_STEP_SIZE + ), + ), + "_left": habitat_sim.ActionSpec( + "turn_left", + habitat_sim.ActuationSpec(amount=self.config.TURN_ANGLE), + ), + "_right": habitat_sim.ActionSpec( + "turn_right", + habitat_sim.ActuationSpec(amount=self.config.TURN_ANGLE), + ), + } diff --git a/habitat-lab-dialog/habitat/sims/habitat_simulator/habitat_simulator.py b/habitat-lab-dialog/habitat/sims/habitat_simulator/habitat_simulator.py new file mode 100644 index 0000000..f9f276d --- /dev/null +++ b/habitat-lab-dialog/habitat/sims/habitat_simulator/habitat_simulator.py @@ -0,0 +1,576 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Optional, + Sequence, + Set, + Union, + cast, +) + +import numpy as np +from gym import spaces +from gym.spaces.box import Box +from numpy import ndarray + +if TYPE_CHECKING: + from torch import Tensor + +import habitat_sim +from habitat.core.dataset import Episode +from habitat.core.registry import registry +from habitat.core.simulator import ( + AgentState, + Config, + DepthSensor, + Observations, + RGBSensor, + SemanticSensor, + Sensor, + SensorSuite, + ShortestPathPoint, + Simulator, + VisualObservation, +) +from habitat.core.spaces import Space + +RGBSENSOR_DIMENSION = 3 + + +def overwrite_config( + config_from: Config, config_to: Any, ignore_keys: Optional[Set[str]] = None +) -> None: + r"""Takes Habitat Lab config and Habitat-Sim config structures. Overwrites + Habitat-Sim config with Habitat Lab values, where a field name is present + in lowercase. Mostly used to avoid :ref:`sim_cfg.field = hapi_cfg.FIELD` + code. + Args: + config_from: Habitat Lab config node. + config_to: Habitat-Sim config structure. + ignore_keys: Optional set of keys to ignore in config_to + """ + + def if_config_to_lower(config): + if isinstance(config, Config): + return {key.lower(): val for key, val in config.items()} + else: + return config + + for attr, value in config_from.items(): + low_attr = attr.lower() + if ignore_keys is None or low_attr not in ignore_keys: + if hasattr(config_to, low_attr): + setattr(config_to, low_attr, if_config_to_lower(value)) + else: + raise NameError( + f"""{low_attr} is not found on habitat_sim but is found on habitat_lab config. + It's also not in the list of keys to ignore: {ignore_keys} + Did you make a typo in the config? 
+ If not the version of Habitat Sim may not be compatible with Habitat Lab version: {config_from} + """ + ) + + +@registry.register_sensor +class HabitatSimRGBSensor(RGBSensor): + sim_sensor_type: habitat_sim.SensorType + + def __init__(self, config: Config) -> None: + self.sim_sensor_type = habitat_sim.SensorType.COLOR + super().__init__(config=config) + + def _get_observation_space(self, *args: Any, **kwargs: Any) -> Box: + return spaces.Box( + low=0, + high=255, + shape=(self.config.HEIGHT, self.config.WIDTH, RGBSENSOR_DIMENSION), + dtype=np.uint8, + ) + + def get_observation( + self, sim_obs: Dict[str, Union[ndarray, bool, "Tensor"]] + ) -> VisualObservation: + obs = cast(Optional[VisualObservation], sim_obs.get(self.uuid, None)) + check_sim_obs(obs, self) + + # remove alpha channel + obs = obs[:, :, :RGBSENSOR_DIMENSION] # type: ignore[index] + return obs + + +@registry.register_sensor +class HabitatSimDepthSensor(DepthSensor): + sim_sensor_type: habitat_sim.SensorType + min_depth_value: float + max_depth_value: float + + def __init__(self, config: Config) -> None: + self.sim_sensor_type = habitat_sim.SensorType.DEPTH + + if config.NORMALIZE_DEPTH: + self.min_depth_value = 0 + self.max_depth_value = 1 + else: + self.min_depth_value = config.MIN_DEPTH + self.max_depth_value = config.MAX_DEPTH + + super().__init__(config=config) + + def _get_observation_space(self, *args: Any, **kwargs: Any) -> Box: + return spaces.Box( + low=self.min_depth_value, + high=self.max_depth_value, + shape=(self.config.HEIGHT, self.config.WIDTH, 1), + dtype=np.float32, + ) + + def get_observation( + self, sim_obs: Dict[str, Union[ndarray, bool, "Tensor"]] + ) -> VisualObservation: + obs = cast(Optional[VisualObservation], sim_obs.get(self.uuid, None)) + check_sim_obs(obs, self) + if isinstance(obs, np.ndarray): + obs = np.clip(obs, self.config.MIN_DEPTH, self.config.MAX_DEPTH) + + obs = np.expand_dims( + obs, axis=2 + ) # make depth observation a 3D array + else: + obs = obs.clamp(self.config.MIN_DEPTH, self.config.MAX_DEPTH) # type: ignore[attr-defined] + + obs = obs.unsqueeze(-1) # type: ignore[attr-defined] + + if self.config.NORMALIZE_DEPTH: + # normalize depth observation to [0, 1] + obs = (obs - self.config.MIN_DEPTH) / ( + self.config.MAX_DEPTH - self.config.MIN_DEPTH + ) + + return obs + + +@registry.register_sensor +class HabitatSimSemanticSensor(SemanticSensor): + sim_sensor_type: habitat_sim.SensorType + + def __init__(self, config): + self.sim_sensor_type = habitat_sim.SensorType.SEMANTIC + super().__init__(config=config) + + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box( + low=np.iinfo(np.uint32).min, + high=np.iinfo(np.uint32).max, + shape=(self.config.HEIGHT, self.config.WIDTH), + dtype=np.uint32, + ) + + def get_observation( + self, sim_obs: Dict[str, Union[ndarray, bool, "Tensor"]] + ) -> VisualObservation: + obs = cast(Optional[VisualObservation], sim_obs.get(self.uuid, None)) + check_sim_obs(obs, self) + return obs + + +def check_sim_obs(obs: ndarray, sensor: Sensor) -> None: + assert obs is not None, ( + "Observation corresponding to {} not present in " + "simulator's observations".format(sensor.uuid) + ) + + +HabitatSimVizSensors = Union[ + HabitatSimRGBSensor, HabitatSimDepthSensor, HabitatSimSemanticSensor +] + + +@registry.register_simulator(name="Sim-v0") +class HabitatSim(habitat_sim.Simulator, Simulator): + r"""Simulator wrapper over habitat-sim + + habitat-sim repo: https://github.com/facebookresearch/habitat-sim + + Args: + config: configuration 
for initializing the simulator. + """ + + def __init__(self, config: Config) -> None: + self.habitat_config = config + agent_config = self._get_agent_config() + + sim_sensors = [] + for sensor_name in agent_config.SENSORS: + sensor_cfg = getattr(self.habitat_config, sensor_name) + sensor_type = registry.get_sensor(sensor_cfg.TYPE) + + assert sensor_type is not None, "invalid sensor type {}".format( + sensor_cfg.TYPE + ) + sim_sensors.append(sensor_type(sensor_cfg)) + + self._sensor_suite = SensorSuite(sim_sensors) + self.sim_config = self.create_sim_config(self._sensor_suite) + self._current_scene = self.sim_config.sim_cfg.scene_id + super().__init__(self.sim_config) + self._action_space = spaces.Discrete( + len(self.sim_config.agents[0].action_space) + ) + self._prev_sim_obs: Optional[Observations] = None + + def create_sim_config( + self, _sensor_suite: SensorSuite + ) -> habitat_sim.Configuration: + sim_config = habitat_sim.SimulatorConfiguration() + # Check if Habitat-Sim is post Scene Config Update + if not hasattr(sim_config, "scene_id"): + raise RuntimeError( + "Incompatible version of Habitat-Sim detected, please upgrade habitat_sim" + ) + overwrite_config( + config_from=self.habitat_config.HABITAT_SIM_V0, + config_to=sim_config, + # Ignore key as it gets propogated to sensor below + ignore_keys={"gpu_gpu"}, + ) + sim_config.scene_id = self.habitat_config.SCENE + agent_config = habitat_sim.AgentConfiguration() + overwrite_config( + config_from=self._get_agent_config(), + config_to=agent_config, + # These keys are only used by Hab-Lab + ignore_keys={ + "is_set_start_state", + # This is the Sensor Config. Unpacked below + "sensors", + "start_position", + "start_rotation", + }, + ) + + sensor_specifications = [] + for sensor in _sensor_suite.sensors.values(): + sim_sensor_cfg = habitat_sim.SensorSpec() + # TODO Handle configs for custom VisualSensors that might need + # their own ignore_keys. 
Maybe with special key / checking + # SensorType + overwrite_config( + config_from=sensor.config, + config_to=sim_sensor_cfg, + # These keys are only used by Hab-Lab + # or translated into the sensor config manually + ignore_keys={ + "height", + "hfov", + "max_depth", + "min_depth", + "normalize_depth", + "type", + "width", + }, + ) + sim_sensor_cfg.uuid = sensor.uuid + sim_sensor_cfg.resolution = list( + sensor.observation_space.shape[:2] + ) + sim_sensor_cfg.parameters["hfov"] = str(sensor.config.HFOV) + + # TODO(maksymets): Add configure method to Sensor API to avoid + # accessing child attributes through parent interface + # We know that the Sensor has to be one of these Sensors + sensor = cast(HabitatSimVizSensors, sensor) + sim_sensor_cfg.sensor_type = sensor.sim_sensor_type + sim_sensor_cfg.gpu2gpu_transfer = ( + self.habitat_config.HABITAT_SIM_V0.GPU_GPU + ) + sensor_specifications.append(sim_sensor_cfg) + + agent_config.sensor_specifications = sensor_specifications + agent_config.action_space = registry.get_action_space_configuration( + self.habitat_config.ACTION_SPACE_CONFIG + )(self.habitat_config).get() + + return habitat_sim.Configuration(sim_config, [agent_config]) + + @property + def sensor_suite(self) -> SensorSuite: + return self._sensor_suite + + @property + def action_space(self) -> Space: + return self._action_space + + def _update_agents_state(self) -> bool: + is_updated = False + for agent_id, _ in enumerate(self.habitat_config.AGENTS): + agent_cfg = self._get_agent_config(agent_id) + if agent_cfg.IS_SET_START_STATE: + self.set_agent_state( + agent_cfg.START_POSITION, + agent_cfg.START_ROTATION, + agent_id, + ) + is_updated = True + + return is_updated + + def reset(self) -> Observations: + sim_obs = super().reset() + if self._update_agents_state(): + sim_obs = self.get_sensor_observations() + + self._prev_sim_obs = sim_obs + return self._sensor_suite.get_observations(sim_obs) + + def step(self, action: Union[str, int]) -> Observations: + sim_obs = super().step(action) + self._prev_sim_obs = sim_obs + observations = self._sensor_suite.get_observations(sim_obs) + return observations + + def render(self, mode: str = "rgb") -> Any: + r""" + Args: + mode: sensor whose observation is used for returning the frame, + eg: "rgb", "depth", "semantic" + + Returns: + rendered frame according to the mode + """ + sim_obs = self.get_sensor_observations() + observations = self._sensor_suite.get_observations(sim_obs) + + output = observations.get(mode) + assert output is not None, "mode {} sensor is not active".format(mode) + if not isinstance(output, np.ndarray): + # If it is not a numpy array, it is a torch tensor + # The function expects the result to be a numpy array + output = output.to("cpu").numpy() + + return output + + def reconfigure(self, habitat_config: Config) -> None: + # TODO(maksymets): Switch to Habitat-Sim more efficient caching + is_same_scene = habitat_config.SCENE == self._current_scene + self.habitat_config = habitat_config + self.sim_config = self.create_sim_config(self._sensor_suite) + if not is_same_scene: + self._current_scene = habitat_config.SCENE + self.close() + super().reconfigure(self.sim_config) + + self._update_agents_state() + + def geodesic_distance( + self, + position_a: Union[Sequence[float], ndarray], + position_b: Union[Sequence[float], Sequence[Sequence[float]]], + episode: Optional[Episode] = None, + ) -> float: + if episode is None or episode._shortest_path_cache is None: + path = habitat_sim.MultiGoalShortestPath() + if 
isinstance(position_b[0], (Sequence, np.ndarray)): + path.requested_ends = np.array(position_b, dtype=np.float32) + else: + path.requested_ends = np.array( + [np.array(position_b, dtype=np.float32)] + ) + else: + path = episode._shortest_path_cache + + path.requested_start = np.array(position_a, dtype=np.float32) + + self.pathfinder.find_path(path) + + if episode is not None: + episode._shortest_path_cache = path + + return path.geodesic_distance + + def action_space_shortest_path( + self, + source: AgentState, + targets: Sequence[AgentState], + agent_id: int = 0, + ) -> List[ShortestPathPoint]: + r""" + Returns: + List of agent states and actions along the shortest path from + source to the nearest target (both included). If one of the + target(s) is identical to the source, a list containing only + one node with the identical agent state is returned. Returns + an empty list in case none of the targets are reachable from + the source. For the last item in the returned list the action + will be None. + """ + raise NotImplementedError( + "This function is no longer implemented. Please use the greedy " + "follower instead" + ) + + @property + def up_vector(self) -> np.ndarray: + return np.array([0.0, 1.0, 0.0]) + + @property + def forward_vector(self) -> np.ndarray: + return -np.array([0.0, 0.0, 1.0]) + + def get_straight_shortest_path_points(self, position_a, position_b): + path = habitat_sim.ShortestPath() + path.requested_start = position_a + path.requested_end = position_b + self.pathfinder.find_path(path) + return path.points + + def sample_navigable_point(self) -> List[float]: + return self.pathfinder.get_random_navigable_point().tolist() + + def is_navigable(self, point: List[float]) -> bool: + return self.pathfinder.is_navigable(point) + + def semantic_annotations(self): + r""" + Returns: + SemanticScene which is a three level hierarchy of semantic + annotations for the current scene. Specifically this method + returns a SemanticScene which contains a list of SemanticLevel's + where each SemanticLevel contains a list of SemanticRegion's where + each SemanticRegion contains a list of SemanticObject's. + + SemanticScene has attributes: aabb(axis-aligned bounding box) which + has attributes aabb.center and aabb.sizes which are 3d vectors, + categories, levels, objects, regions. + + SemanticLevel has attributes: id, aabb, objects and regions. + + SemanticRegion has attributes: id, level, aabb, category (to get + name of category use category.name()) and objects. + + SemanticObject has attributes: id, region, aabb, obb (oriented + bounding box) and category. 
+ + SemanticScene contains List[SemanticLevels] + SemanticLevel contains List[SemanticRegion] + SemanticRegion contains List[SemanticObject] + + Example to loop through in a hierarchical fashion: + for level in semantic_scene.levels: + for region in level.regions: + for obj in region.objects: + """ + return self.semantic_scene + + def _get_agent_config(self, agent_id: Optional[int] = None) -> Any: + if agent_id is None: + agent_id = self.habitat_config.DEFAULT_AGENT_ID + agent_name = self.habitat_config.AGENTS[agent_id] + agent_config = getattr(self.habitat_config, agent_name) + return agent_config + + def get_agent_state(self, agent_id: int = 0) -> habitat_sim.AgentState: + assert agent_id == 0, "No support of multi agent in {} yet.".format( + self.__class__.__name__ + ) + return self.get_agent(agent_id).get_state() + + def set_agent_state( + self, + position: List[float], + rotation: List[float], + agent_id: int = 0, + reset_sensors: bool = True, + ) -> bool: + r"""Sets agent state similar to initialize_agent, but without agents + creation. On failure to place the agent in the proper position, it is + moved back to its previous pose. + + Args: + position: list containing 3 entries for (x, y, z). + rotation: list with 4 entries for (x, y, z, w) elements of unit + quaternion (versor) representing agent 3D orientation, + (https://en.wikipedia.org/wiki/Versor) + agent_id: int identification of agent from multiagent setup. + reset_sensors: bool for if sensor changes (e.g. tilt) should be + reset). + + Returns: + True if the set was successful else moves the agent back to its + original pose and returns false. + """ + agent = self.get_agent(agent_id) + new_state = self.get_agent_state(agent_id) + new_state.position = position + new_state.rotation = rotation + + # NB: The agent state also contains the sensor states in _absolute_ + # coordinates. In order to set the agent's body to a specific + # location and have the sensors follow, we must not provide any + # state for the sensors. This will cause them to follow the agent's + # body + new_state.sensor_states = {} + agent.set_state(new_state, reset_sensors) + return True + + def get_observations_at( + self, + position: Optional[List[float]] = None, + rotation: Optional[List[float]] = None, + keep_agent_at_new_pose: bool = False, + ) -> Optional[Observations]: + current_state = self.get_agent_state() + if position is None or rotation is None: + success = True + else: + success = self.set_agent_state( + position, rotation, reset_sensors=False + ) + + if success: + sim_obs = self.get_sensor_observations() + + self._prev_sim_obs = sim_obs + + observations = self._sensor_suite.get_observations(sim_obs) + if not keep_agent_at_new_pose: + self.set_agent_state( + current_state.position, + current_state.rotation, + reset_sensors=False, + ) + return observations + else: + return None + + def distance_to_closest_obstacle( + self, position: ndarray, max_search_radius: float = 2.0 + ) -> float: + return self.pathfinder.distance_to_closest_obstacle( + position, max_search_radius + ) + + def island_radius(self, position: Sequence[float]) -> float: + return self.pathfinder.island_radius(position) + + @property + def previous_step_collided(self): + r"""Whether or not the previous step resulted in a collision + + Returns: + bool: True if the previous step resulted in a collision, false otherwise + + Warning: + This feild is only updated when :meth:`step`, :meth:`reset`, or :meth:`get_observations_at` are + called. 
+            It does not update when the agent is moved to a new location. Furthermore, it
+            will _always_ be false after :meth:`reset` or :meth:`get_observations_at` as neither of those
+            result in an action (step) being taken.
+        """
+        return self._prev_sim_obs.get("collided", False)
diff --git a/habitat-lab-dialog/habitat/sims/pyrobot/__init__.py b/habitat-lab-dialog/habitat/sims/pyrobot/__init__.py
new file mode 100644
index 0000000..8106f7b
--- /dev/null
+++ b/habitat-lab-dialog/habitat/sims/pyrobot/__init__.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from habitat.core.registry import registry
+from habitat.core.simulator import Simulator
+
+
+def _try_register_pyrobot():
+    try:
+        import pyrobot  # noqa: F401
+
+        has_pyrobot = True
+    except ImportError as e:
+        has_pyrobot = False
+        pyrobot_import_error = e
+
+    if has_pyrobot:
+        from habitat.sims.pyrobot.pyrobot import PyRobot  # noqa: F401
+    else:
+
+        @registry.register_simulator(name="PyRobot-v0")
+        class PyRobotImportError(Simulator):
+            def __init__(self, *args, **kwargs):
+                raise pyrobot_import_error
diff --git a/habitat-lab-dialog/habitat/sims/pyrobot/pyrobot.py b/habitat-lab-dialog/habitat/sims/pyrobot/pyrobot.py
new file mode 100644
index 0000000..13b10db
--- /dev/null
+++ b/habitat-lab-dialog/habitat/sims/pyrobot/pyrobot.py
@@ -0,0 +1,286 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any
+
+import numpy as np
+import pyrobot
+from gym import Space, spaces
+
+from habitat.core.registry import registry
+from habitat.core.simulator import (
+    BumpSensor,
+    Config,
+    DepthSensor,
+    RGBSensor,
+    SensorSuite,
+    Simulator,
+)
+from habitat.core.utils import center_crop, try_cv2_import
+
+cv2 = try_cv2_import()
+
+
+def _locobot_base_action_space():
+    return spaces.Dict(
+        {
+            "go_to_relative": spaces.Box(low=-np.inf, high=np.inf, shape=(3,)),
+            "go_to_absolute": spaces.Box(low=-np.inf, high=np.inf, shape=(3,)),
+        }
+    )
+
+
+def _locobot_camera_action_space():
+    return spaces.Dict(
+        {
+            "set_pan": spaces.Box(low=-np.inf, high=np.inf, shape=(1,)),
+            "set_tilt": spaces.Box(low=-np.inf, high=np.inf, shape=(1,)),
+            "set_pan_tilt": spaces.Box(low=-np.inf, high=np.inf, shape=(2,)),
+        }
+    )
+
+
+def _resize_observation(obs, observation_space, config):
+    if obs.shape != observation_space.shape:
+        if (
+            config.CENTER_CROP is True
+            and obs.shape[0] > observation_space.shape[0]
+            and obs.shape[1] > observation_space.shape[1]
+        ):
+            obs = center_crop(obs, observation_space)
+
+        else:
+            obs = cv2.resize(
+                obs, (observation_space.shape[1], observation_space.shape[0])
+            )
+    return obs
+
+
+MM_IN_METER = 1000  # millimeters in a meter
+ACTION_SPACES = {
+    "LOCOBOT": {
+        "BASE_ACTIONS": _locobot_base_action_space(),
+        "CAMERA_ACTIONS": _locobot_camera_action_space(),
+    }
+}
+
+
+@registry.register_sensor
+class PyRobotRGBSensor(RGBSensor):
+    def __init__(self, config):
+        super().__init__(config=config)
+
+    def _get_observation_space(self, *args: Any, **kwargs: Any):
+        return spaces.Box(
+            low=0,
+            high=255,
+            shape=(self.config.HEIGHT, self.config.WIDTH, 3),
+            dtype=np.uint8,
+        )
+
+    def get_observation(self, robot_obs, *args: Any, **kwargs: Any):
+        obs = robot_obs.get(self.uuid, None)
+
+        assert obs is not
None, "Invalid observation for {} sensor".format( + self.uuid + ) + + obs = _resize_observation(obs, self.observation_space, self.config) + + return obs + + +@registry.register_sensor +class PyRobotDepthSensor(DepthSensor): + min_depth_value: float + max_depth_value: float + + def __init__(self, config): + if config.NORMALIZE_DEPTH: + self.min_depth_value = 0 + self.max_depth_value = 1 + else: + self.min_depth_value = config.MIN_DEPTH + self.max_depth_value = config.MAX_DEPTH + + super().__init__(config=config) + + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box( + low=self.min_depth_value, + high=self.max_depth_value, + shape=(self.config.HEIGHT, self.config.WIDTH, 1), + dtype=np.float32, + ) + + def get_observation(self, robot_obs, *args: Any, **kwargs: Any): + obs = robot_obs.get(self.uuid, None) + + assert obs is not None, "Invalid observation for {} sensor".format( + self.uuid + ) + + obs = _resize_observation(obs, self.observation_space, self.config) + + obs = obs / MM_IN_METER # convert from mm to m + + obs = np.clip(obs, self.config.MIN_DEPTH, self.config.MAX_DEPTH) + if self.config.NORMALIZE_DEPTH: + # normalize depth observations to [0, 1] + obs = (obs - self.config.MIN_DEPTH) / ( + self.config.MAX_DEPTH - self.config.MIN_DEPTH + ) + + obs = np.expand_dims(obs, axis=2) # make depth observations a 3D array + + return obs + + +@registry.register_sensor +class PyRobotBumpSensor(BumpSensor): + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box(low=False, high=True, shape=(1,), dtype=np.bool) + + def get_observation(self, robot_obs, *args: Any, **kwargs: Any): + return np.array(robot_obs["bump"]) + + +@registry.register_simulator(name="PyRobot-v0") +class PyRobot(Simulator): + r"""Simulator wrapper over PyRobot. + + PyRobot repo: https://github.com/facebookresearch/pyrobot + To use this abstraction the user will have to setup PyRobot + python3 version. Please refer to the PyRobot repository + for setting it up. The user will also have to export a + ROS_PATH environment variable to use this integration, + please refer to :ref:`habitat.core.utils.try_cv2_import` for + more details on this. + + This abstraction assumes that reality is a simulation + (https://www.youtube.com/watch?v=tlTKTTt47WE). + + Args: + config: configuration for initializing the PyRobot object. 
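For orientation, a minimal usage sketch of this wrapper (illustrative only, not part of the diff; it assumes PyRobot is installed, a config whose ROBOT, SENSORS, and controller fields are populated, and the xyt_position argument name is taken from the PyRobot base API):

    reality = PyRobot(config)  # hypothetical, pre-filled Habitat config
    obs = reality.reset()      # resets the camera and returns sensor observations
    # "go_to_relative" is one of the LoCoBot BASE_ACTIONS defined above
    obs = reality.step("go_to_relative", {"xyt_position": [0.3, 0.0, 0.0]})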
+ """ + + def __init__(self, config: Config) -> None: + self._config = config + + robot_sensors = [] + for sensor_name in self._config.SENSORS: + sensor_cfg = getattr(self._config, sensor_name) + sensor_type = registry.get_sensor(sensor_cfg.TYPE) + + assert sensor_type is not None, "invalid sensor type {}".format( + sensor_cfg.TYPE + ) + robot_sensors.append(sensor_type(sensor_cfg)) + self._sensor_suite = SensorSuite(robot_sensors) + + config_pyrobot = { + "base_controller": self._config.BASE_CONTROLLER, + "base_planner": self._config.BASE_PLANNER, + } + + assert ( + self._config.ROBOT in self._config.ROBOTS + ), "Invalid robot type {}".format(self._config.ROBOT) + self._robot_config = getattr(self._config, self._config.ROBOT.upper()) + + self._action_space = self._robot_action_space( + self._config.ROBOT, self._robot_config + ) + + self._robot = pyrobot.Robot( + self._config.ROBOT, base_config=config_pyrobot + ) + + def get_robot_observations(self): + return { + "rgb": self._robot.camera.get_rgb(), + "depth": self._robot.camera.get_depth(), + "bump": self._robot.base.base_state.bumper, + } + + @property + def sensor_suite(self) -> SensorSuite: + return self._sensor_suite + + @property + def base(self): + return self._robot.base + + @property + def camera(self): + return self._robot.camera + + def _robot_action_space(self, robot_type, robot_config): + action_spaces_dict = {} + for action in robot_config.ACTIONS: + action_spaces_dict[action] = ACTION_SPACES[robot_type.upper()][ + action + ] + return spaces.Dict(action_spaces_dict) + + @property + def action_space(self) -> Space: + return self._action_space + + def reset(self): + self._robot.camera.reset() + + observations = self._sensor_suite.get_observations( + robot_obs=self.get_robot_observations() + ) + return observations + + def step(self, action, action_params): + r"""Step in reality. Currently the supported + actions are the ones defined in :ref:`_locobot_base_action_space` + and :ref:`_locobot_camera_action_space`. For details on how + to use these actions please refer to the documentation + of namesake methods in PyRobot + (https://github.com/facebookresearch/pyrobot). 
+ """ + if action in self._robot_config.BASE_ACTIONS: + getattr(self._robot.base, action)(**action_params) + elif action in self._robot_config.CAMERA_ACTIONS: + getattr(self._robot.camera, action)(**action_params) + else: + raise ValueError("Invalid action {}".format(action)) + + observations = self._sensor_suite.get_observations( + robot_obs=self.get_robot_observations() + ) + + return observations + + def render(self, mode: str = "rgb") -> Any: + observations = self._sensor_suite.get_observations( + robot_obs=self.get_robot_observations() + ) + + output = observations.get(mode) + assert output is not None, "mode {} sensor is not active".format(mode) + + return output + + def get_agent_state( + self, agent_id: int = 0, base_state_type: str = "odom" + ): + assert agent_id == 0, "No support of multi agent in {} yet.".format( + self.__class__.__name__ + ) + state = { + "base": self._robot.base.get_state(base_state_type), + "camera": self._robot.camera.get_state(), + } + # TODO(akadian): add arm state when supported + return state + + def seed(self, seed: int) -> None: + raise NotImplementedError("No support for seeding in reality") diff --git a/habitat-lab-dialog/habitat/sims/registration.py b/habitat-lab-dialog/habitat/sims/registration.py new file mode 100644 index 0000000..3006b64 --- /dev/null +++ b/habitat-lab-dialog/habitat/sims/registration.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat.core.logging import logger +from habitat.core.registry import registry +from habitat.sims.habitat_simulator import _try_register_habitat_sim +from habitat.sims.pyrobot import _try_register_pyrobot + + +def make_sim(id_sim, **kwargs): + logger.info("initializing sim {}".format(id_sim)) + _sim = registry.get_simulator(id_sim) + assert _sim is not None, "Could not find simulator with name {}".format( + id_sim + ) + return _sim(**kwargs) + + +_try_register_habitat_sim() +_try_register_pyrobot() diff --git a/habitat-lab-dialog/habitat/tasks/__init__.py b/habitat-lab-dialog/habitat/tasks/__init__.py new file mode 100644 index 0000000..95996cd --- /dev/null +++ b/habitat-lab-dialog/habitat/tasks/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat.tasks.registration import make_task # noqa: F401 diff --git a/habitat-lab-dialog/habitat/tasks/eqa/__init__.py b/habitat-lab-dialog/habitat/tasks/eqa/__init__.py new file mode 100644 index 0000000..11dd265 --- /dev/null +++ b/habitat-lab-dialog/habitat/tasks/eqa/__init__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
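The task packages that follow (eqa, nav, vln) reuse the deferred-registration pattern of the pyrobot simulator above: when the underlying import fails, a stub is registered under the same name whose constructor re-raises the original ImportError. A sketch of the resulting behaviour (assuming, for illustration, that pyrobot is not installed):

    from habitat.sims.registration import make_sim

    try:
        # Lookup succeeds because the PyRobotImportError stub was registered,
        # but instantiation surfaces the original ImportError lazily.
        sim = make_sim("PyRobot-v0", config=None)
    except ImportError as err:
        print(f"PyRobot backend unavailable: {err}")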
+ +from habitat.core.embodied_task import EmbodiedTask +from habitat.core.registry import registry + + +def _try_register_eqa_task(): + try: + from habitat.tasks.eqa.eqa import EQATask # noqa: F401 + except ImportError as e: + eqatask_import_error = e + + @registry.register_task(name="EQA-v0") + class EQATaskImportError(EmbodiedTask): + def __init__(self, *args, **kwargs): + raise eqatask_import_error diff --git a/habitat-lab-dialog/habitat/tasks/eqa/eqa.py b/habitat-lab-dialog/habitat/tasks/eqa/eqa.py new file mode 100644 index 0000000..3563cbb --- /dev/null +++ b/habitat-lab-dialog/habitat/tasks/eqa/eqa.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, Dict, List, Optional + +import attr +from gym import Space, spaces + +from habitat.core.embodied_task import Action, Measure +from habitat.core.registry import registry +from habitat.core.simulator import Observations, Sensor, SensorTypes +from habitat.core.spaces import ListSpace +from habitat.core.utils import not_none_validator +from habitat.tasks.nav.nav import NavigationEpisode, NavigationTask + + +@attr.s(auto_attribs=True) +class QuestionData: + question_text: str + answer_text: str + question_tokens: Optional[List[str]] = None + answer_token: Optional[List[str]] = None + question_type: Optional[str] = None + + +@attr.s(auto_attribs=True, kw_only=True) +class EQAEpisode(NavigationEpisode): + r"""Specification of episode that includes initial position and rotation of + agent, goal, question specifications and optional shortest paths. + + Args: + scene_id: id of scene inside the simulator. + start_position: numpy ndarray containing 3 entries for (x, y, z). + start_rotation: numpy ndarray with 4 entries for (x, y, z, w) + elements of unit quaternion (versor) representing agent 3D + orientation. + goals: relevant goal object/room. + question: question related to goal object. 
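For example, the question field of such an episode might hold a QuestionData record like the following (hypothetical values, shown only to illustrate the attrs class above):

    question = QuestionData(
        question_text="what color is the sofa?",
        answer_text="brown",
        question_type="color",
    )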
+ """ + + question: QuestionData = attr.ib( + default=None, validator=not_none_validator + ) + + +@registry.register_sensor +class QuestionSensor(Sensor): + def __init__(self, dataset, *args: Any, **kwargs: Any): + self._dataset = dataset + super().__init__(*args, **kwargs) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return "question" + + def _get_sensor_type(self, *args: Any, **kwargs: Any) -> SensorTypes: + return SensorTypes.TOKEN_IDS + + def get_observation( + self, + observations: Dict[str, Observations], + episode: EQAEpisode, + *args: Any, + **kwargs: Any + ): + return episode.question.question_tokens + + def _get_observation_space(self, *args: Any, **kwargs: Any) -> Space: + return ListSpace( + spaces.Discrete(self._dataset.question_vocab.get_size()) + ) + + +@registry.register_measure +class CorrectAnswer(Measure): + """CorrectAnswer""" + + def __init__(self, dataset, *args: Any, **kwargs: Any): + self._dataset = dataset + super().__init__(**kwargs) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return "correct_answer" + + def reset_metric(self, episode, *args: Any, **kwargs: Any): + self._metric = episode.question.answer_token + + def update_metric(self, *args: Any, **kwargs: Any): + pass + + +@registry.register_measure +class EpisodeInfo(Measure): + """Episode Info""" + + def __init__(self, sim, config, *args: Any, **kwargs: Any): + self._sim = sim + self._config = config + + super().__init__(**kwargs) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return "episode_info" + + def reset_metric(self, episode, *args: Any, **kwargs: Any): + self._metric = vars(episode).copy() + + def update_metric(self, episode, action, *args: Any, **kwargs: Any): + pass + + +@registry.register_measure +class AnswerAccuracy(Measure): + """AnswerAccuracy""" + + def __init__(self, dataset, *args: Any, **kwargs: Any): + self._dataset = dataset + super().__init__(**kwargs) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return "answer_accuracy" + + def reset_metric(self, episode, *args: Any, **kwargs: Any): + self._metric = 0 + + def update_metric( + self, action=None, episode=None, *args: Any, **kwargs: Any + ): + if episode is None: + return + + if action["action"] == AnswerAction.name: + self._metric = ( + 1 + if episode.question.answer_token + == action["action_args"]["answer_id"] + else 0 + ) + + +@registry.register_task(name="EQA-v0") +class EQATask(NavigationTask): + """ + Embodied Question Answering Task + Usage example: + env = habitat.Env(config=eqa_config) + + env.reset() + + for i in range(10): + action = sample_non_stop_action(env.action_space) + if action["action"] != AnswerAction.name: + env.step(action) + metrics = env.get_metrics() # to check distance to target + + correct_answer_id = env.current_episode.question.answer_token + env.step( + { + "action": AnswerAction.name, + "action_args": {"answer_id": correct_answer_id}, + } + ) + + metrics = env.get_metrics() + """ + + is_valid: bool = False + answer: Optional[int] = None + invalid_reason: Optional[str] = None + + def _check_episode_is_active( + self, *args, action, episode, action_args=None, **kwargs + ) -> bool: + return self.is_valid and self.answer is None + + +@registry.register_task_action +class AnswerAction(Action): + _answer: Optional[str] + name: str = "ANSWER" + + def __init__(self, *args: Any, sim, dataset, **kwargs: Any) -> None: + self._sim = sim + self._dataset = dataset + + def reset(self, task: EQATask, *args: Any, **kwargs: Any) -> None: + task.answer = None + 
task.is_valid = True + return + + def step( + self, *args: Any, answer_id: int, task: EQATask, **kwargs: Any + ) -> Dict[str, Observations]: + if task.answer is not None: + task.is_valid = False + task.invalid_reason = "Agent answered question twice." + + task.answer = answer_id + return self._sim.get_observations_at() + + @property + def action_space(self) -> spaces.Dict: + """Answer expected to be single token.""" + return spaces.Dict( + { + "answer_id": spaces.Discrete( + self._dataset.answer_vocab.get_size() + ) + } + ) diff --git a/habitat-lab-dialog/habitat/tasks/nav/__init__.py b/habitat-lab-dialog/habitat/tasks/nav/__init__.py new file mode 100644 index 0000000..a3abbec --- /dev/null +++ b/habitat-lab-dialog/habitat/tasks/nav/__init__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat.core.embodied_task import EmbodiedTask +from habitat.core.registry import registry + + +def _try_register_nav_task(): + try: + from habitat.tasks.nav.nav import NavigationTask # noqa + except ImportError as e: + navtask_import_error = e + + @registry.register_task(name="Nav-v0") + class NavigationTaskImportError(EmbodiedTask): + def __init__(self, *args, **kwargs): + raise navtask_import_error diff --git a/habitat-lab-dialog/habitat/tasks/nav/nav.py b/habitat-lab-dialog/habitat/tasks/nav/nav.py new file mode 100644 index 0000000..fdf7f8c --- /dev/null +++ b/habitat-lab-dialog/habitat/tasks/nav/nav.py @@ -0,0 +1,1140 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
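Tying the EQA pieces above together: the answer action consumed by EQATask is a plain dict matching AnswerAction.action_space (sketch; the env handle and the answer id are hypothetical):

    action = {
        "action": AnswerAction.name,        # "ANSWER"
        "action_args": {"answer_id": 17},   # index into dataset.answer_vocab
    }
    observations = env.step(action)  # a habitat.Env wired with an EQA task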
+ +# TODO, lots of typing errors in here + +from typing import Any, List, Optional, Tuple + +import attr +import numpy as np +from gym import spaces + +from habitat.config import Config +from habitat.core.dataset import Dataset, Episode +from habitat.core.embodied_task import ( + EmbodiedTask, + Measure, + SimulatorTaskAction, +) +from habitat.core.logging import logger +from habitat.core.registry import registry +from habitat.core.simulator import ( + AgentState, + RGBSensor, + Sensor, + SensorTypes, + ShortestPathPoint, + Simulator, +) +from habitat.core.utils import not_none_validator, try_cv2_import +from habitat.sims.habitat_simulator.actions import HabitatSimActions +from habitat.tasks.utils import cartesian_to_polar +from habitat.utils.geometry_utils import ( + quaternion_from_coeff, + quaternion_rotate_vector, +) +from habitat.utils.visualizations import fog_of_war, maps + +try: + from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim +except ImportError: + pass +cv2 = try_cv2_import() + + +MAP_THICKNESS_SCALAR: int = 128 + + +def merge_sim_episode_config(sim_config: Config, episode: Episode) -> Any: + sim_config.defrost() + sim_config.SCENE = episode.scene_id + sim_config.freeze() + if ( + episode.start_position is not None + and episode.start_rotation is not None + ): + agent_name = sim_config.AGENTS[sim_config.DEFAULT_AGENT_ID] + agent_cfg = getattr(sim_config, agent_name) + agent_cfg.defrost() + agent_cfg.START_POSITION = episode.start_position + agent_cfg.START_ROTATION = episode.start_rotation + agent_cfg.IS_SET_START_STATE = True + agent_cfg.freeze() + return sim_config + + +@attr.s(auto_attribs=True, kw_only=True) +class NavigationGoal: + r"""Base class for a goal specification hierarchy.""" + + position: List[float] = attr.ib(default=None, validator=not_none_validator) + radius: Optional[float] = None + + +@attr.s(auto_attribs=True, kw_only=True) +class RoomGoal(NavigationGoal): + r"""Room goal that can be specified by room_id or position with radius.""" + + room_id: str = attr.ib(default=None, validator=not_none_validator) + room_name: Optional[str] = None + + +@attr.s(auto_attribs=True, kw_only=True) +class NavigationEpisode(Episode): + r"""Class for episode specification that includes initial position and + rotation of agent, scene name, goal and optional shortest paths. An + episode is a description of one task instance for the agent. + + Args: + episode_id: id of episode in the dataset, usually episode number + scene_id: id of scene in scene dataset + start_position: numpy ndarray containing 3 entries for (x, y, z) + start_rotation: numpy ndarray with 4 entries for (x, y, z, w) + elements of unit quaternion (versor) representing agent 3D + orientation. ref: https://en.wikipedia.org/wiki/Versor + goals: list of goals specifications + start_room: room id + shortest_paths: list containing shortest paths to goals + """ + + goals: List[NavigationGoal] = attr.ib( + default=None, validator=not_none_validator + ) + start_room: Optional[str] = None + shortest_paths: Optional[List[List[ShortestPathPoint]]] = None + + +@registry.register_sensor +class PointGoalSensor(Sensor): + r"""Sensor for PointGoal observations which are used in PointGoal Navigation. + + For the agent in simulator the forward direction is along negative-z. + In polar coordinate format the angle returned is azimuth to the goal. + + Args: + sim: reference to the simulator for calculating task observations. + config: config for the PointGoal sensor. 
+            Can contain field for
+            GOAL_FORMAT which can be used to specify the format in which
+            the pointgoal is specified. Current options for goal format are
+            cartesian and polar.
+
+            Also contains a DIMENSIONALITY field which specifies the number
+            of dimensions used to specify the goal, must be in [2, 3]
+
+    Attributes:
+        _goal_format: format for specifying the goal which can be done
+            in cartesian or polar coordinates.
+        _dimensionality: number of dimensions used to specify the goal
+    """
+    cls_uuid: str = "pointgoal"
+
+    def __init__(
+        self, sim: Simulator, config: Config, *args: Any, **kwargs: Any
+    ):
+        self._sim = sim
+
+        self._goal_format = getattr(config, "GOAL_FORMAT", "CARTESIAN")
+        assert self._goal_format in ["CARTESIAN", "POLAR"]
+
+        self._dimensionality = getattr(config, "DIMENSIONALITY", 2)
+        assert self._dimensionality in [2, 3]
+
+        super().__init__(config=config)
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def _get_sensor_type(self, *args: Any, **kwargs: Any):
+        return SensorTypes.PATH
+
+    def _get_observation_space(self, *args: Any, **kwargs: Any):
+        sensor_shape = (self._dimensionality,)
+
+        return spaces.Box(
+            low=np.finfo(np.float32).min,
+            high=np.finfo(np.float32).max,
+            shape=sensor_shape,
+            dtype=np.float32,
+        )
+
+    def _compute_pointgoal(
+        self, source_position, source_rotation, goal_position
+    ):
+        direction_vector = goal_position - source_position
+        direction_vector_agent = quaternion_rotate_vector(
+            source_rotation.inverse(), direction_vector
+        )
+
+        if self._goal_format == "POLAR":
+            if self._dimensionality == 2:
+                rho, phi = cartesian_to_polar(
+                    -direction_vector_agent[2], direction_vector_agent[0]
+                )
+                return np.array([rho, -phi], dtype=np.float32)
+            else:
+                _, phi = cartesian_to_polar(
+                    -direction_vector_agent[2], direction_vector_agent[0]
+                )
+                theta = np.arccos(
+                    direction_vector_agent[1]
+                    / np.linalg.norm(direction_vector_agent)
+                )
+                rho = np.linalg.norm(direction_vector_agent)
+
+                return np.array([rho, -phi, theta], dtype=np.float32)
+        else:
+            if self._dimensionality == 2:
+                return np.array(
+                    [-direction_vector_agent[2], direction_vector_agent[0]],
+                    dtype=np.float32,
+                )
+            else:
+                return direction_vector_agent
+
+    def get_observation(
+        self,
+        observations,
+        episode: NavigationEpisode,
+        *args: Any,
+        **kwargs: Any,
+    ):
+        source_position = np.array(episode.start_position, dtype=np.float32)
+        rotation_world_start = quaternion_from_coeff(episode.start_rotation)
+        goal_position = np.array(episode.goals[0].position, dtype=np.float32)
+
+        return self._compute_pointgoal(
+            source_position, rotation_world_start, goal_position
+        )
+
+
+@registry.register_sensor
+class ImageGoalSensor(Sensor):
+    r"""Sensor for ImageGoal observations which are used in ImageGoal Navigation.
+
+    RGBSensor needs to be one of the Simulator sensors.
+    This sensor returns the RGB image taken from the goal position to reach,
+    with a random rotation.
+
+    Args:
+        sim: reference to the simulator for calculating task observations.
+        config: config for the ImageGoal sensor.
+    """
+    cls_uuid: str = "imagegoal"
+
+    def __init__(
+        self, *args: Any, sim: Simulator, config: Config, **kwargs: Any
+    ):
+        self._sim = sim
+        sensors = self._sim.sensor_suite.sensors
+        rgb_sensor_uuids = [
+            uuid
+            for uuid, sensor in sensors.items()
+            if isinstance(sensor, RGBSensor)
+        ]
+        if len(rgb_sensor_uuids) != 1:
+            raise ValueError(
+                f"ImageGoalNav requires one RGB sensor, {len(rgb_sensor_uuids)} detected"
+            )
+
+        (self._rgb_sensor_uuid,) = rgb_sensor_uuids
+        self._current_episode_id: Optional[str] = None
+        self._current_image_goal = None
+        super().__init__(config=config)
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def _get_sensor_type(self, *args: Any, **kwargs: Any):
+        return SensorTypes.PATH
+
+    def _get_observation_space(self, *args: Any, **kwargs: Any):
+        return self._sim.sensor_suite.observation_spaces.spaces[
+            self._rgb_sensor_uuid
+        ]
+
+    def _get_pointnav_episode_image_goal(self, episode: NavigationEpisode):
+        goal_position = np.array(episode.goals[0].position, dtype=np.float32)
+        # to be sure that the rotation is the same for the same episode_id
+        # since the task is currently using pointnav Dataset.
+        seed = abs(hash(episode.episode_id)) % (2 ** 32)
+        rng = np.random.RandomState(seed)
+        angle = rng.uniform(0, 2 * np.pi)
+        source_rotation = [0, np.sin(angle / 2), 0, np.cos(angle / 2)]
+        goal_observation = self._sim.get_observations_at(
+            position=goal_position.tolist(), rotation=source_rotation
+        )
+        return goal_observation[self._rgb_sensor_uuid]
+
+    def get_observation(
+        self,
+        *args: Any,
+        observations,
+        episode: NavigationEpisode,
+        **kwargs: Any,
+    ):
+        episode_uniq_id = f"{episode.scene_id} {episode.episode_id}"
+        if episode_uniq_id == self._current_episode_id:
+            return self._current_image_goal
+
+        self._current_image_goal = self._get_pointnav_episode_image_goal(
+            episode
+        )
+        self._current_episode_id = episode_uniq_id
+
+        return self._current_image_goal
+
+
+@registry.register_sensor(name="PointGoalWithGPSCompassSensor")
+class IntegratedPointGoalGPSAndCompassSensor(PointGoalSensor):
+    r"""Sensor that integrates PointGoals observations (which are used in
+    PointGoal Navigation) and GPS+Compass.
+
+    For the agent in simulator the forward direction is along negative-z.
+    In polar coordinate format the angle returned is azimuth to the goal.
+
+    Args:
+        sim: reference to the simulator for calculating task observations.
+        config: config for the PointGoal sensor. Can contain field for
+            GOAL_FORMAT which can be used to specify the format in which
+            the pointgoal is specified. Current options for goal format are
+            cartesian and polar.
+
+            Also contains a DIMENSIONALITY field which specifies the number
+            of dimensions used to specify the goal, must be in [2, 3]
+
+    Attributes:
+        _goal_format: format for specifying the goal which can be done
+            in cartesian or polar coordinates.
+        _dimensionality: number of dimensions used to specify the goal
+    """
+    cls_uuid: str = "pointgoal_with_gps_compass"
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def get_observation(
+        self, observations, episode, *args: Any, **kwargs: Any
+    ):
+        agent_state = self._sim.get_agent_state()
+        agent_position = agent_state.position
+        rotation_world_agent = agent_state.rotation
+        goal_position = np.array(episode.goals[0].position, dtype=np.float32)
+
+        return self._compute_pointgoal(
+            agent_position, rotation_world_agent, goal_position
+        )
+
+
+@registry.register_sensor
+class HeadingSensor(Sensor):
+    r"""Sensor for observing the agent's heading in the global coordinate
+    frame.
+
+    Args:
+        sim: reference to the simulator for calculating task observations.
+        config: config for the sensor.
+    """
+    cls_uuid: str = "heading"
+
+    def __init__(
+        self, sim: Simulator, config: Config, *args: Any, **kwargs: Any
+    ):
+        self._sim = sim
+        super().__init__(config=config)
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def _get_sensor_type(self, *args: Any, **kwargs: Any):
+        return SensorTypes.HEADING
+
+    def _get_observation_space(self, *args: Any, **kwargs: Any):
+        return spaces.Box(low=-np.pi, high=np.pi, shape=(1,), dtype=np.float32)
+
+    def _quat_to_xy_heading(self, quat):
+        direction_vector = np.array([0, 0, -1])
+
+        heading_vector = quaternion_rotate_vector(quat, direction_vector)
+
+        phi = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1]
+        return np.array([phi], dtype=np.float32)
+
+    def get_observation(
+        self, observations, episode, *args: Any, **kwargs: Any
+    ):
+        agent_state = self._sim.get_agent_state()
+        rotation_world_agent = agent_state.rotation
+
+        return self._quat_to_xy_heading(rotation_world_agent.inverse())
+
+
+@registry.register_sensor(name="CompassSensor")
+class EpisodicCompassSensor(HeadingSensor):
+    r"""The agent's heading in the coordinate frame defined by the episode;
+    theta=0 is defined by the agent's state at t=0
+    """
+    cls_uuid: str = "compass"
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def get_observation(
+        self, observations, episode, *args: Any, **kwargs: Any
+    ):
+        agent_state = self._sim.get_agent_state()
+        rotation_world_agent = agent_state.rotation
+        rotation_world_start = quaternion_from_coeff(episode.start_rotation)
+
+        return self._quat_to_xy_heading(
+            rotation_world_agent.inverse() * rotation_world_start
+        )
+
+
+@registry.register_sensor(name="GPSSensor")
+class EpisodicGPSSensor(Sensor):
+    r"""The agent's current location in the coordinate frame defined by the episode,
+    i.e. the axis it faces along and the origin are defined by its state at t=0
+
+    Args:
+        sim: reference to the simulator for calculating task observations.
+ config: Contains the DIMENSIONALITY field for the number of dimensions to express the agents position + Attributes: + _dimensionality: number of dimensions used to specify the agents position + """ + cls_uuid: str = "gps" + + def __init__( + self, sim: Simulator, config: Config, *args: Any, **kwargs: Any + ): + self._sim = sim + + self._dimensionality = getattr(config, "DIMENSIONALITY", 2) + assert self._dimensionality in [2, 3] + super().__init__(config=config) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.POSITION + + def _get_observation_space(self, *args: Any, **kwargs: Any): + sensor_shape = (self._dimensionality,) + return spaces.Box( + low=np.finfo(np.float32).min, + high=np.finfo(np.float32).max, + shape=sensor_shape, + dtype=np.float32, + ) + + def get_observation( + self, observations, episode, *args: Any, **kwargs: Any + ): + agent_state = self._sim.get_agent_state() + + origin = np.array(episode.start_position, dtype=np.float32) + rotation_world_start = quaternion_from_coeff(episode.start_rotation) + + agent_position = agent_state.position + + agent_position = quaternion_rotate_vector( + rotation_world_start.inverse(), agent_position - origin + ) + if self._dimensionality == 2: + return np.array( + [-agent_position[2], agent_position[0]], dtype=np.float32 + ) + else: + return agent_position.astype(np.float32) + + +@registry.register_sensor +class ProximitySensor(Sensor): + r"""Sensor for observing the distance to the closest obstacle + + Args: + sim: reference to the simulator for calculating task observations. + config: config for the sensor. + """ + cls_uuid: str = "proximity" + + def __init__(self, sim, config, *args: Any, **kwargs: Any): + self._sim = sim + self._max_detection_radius = getattr( + config, "MAX_DETECTION_RADIUS", 2.0 + ) + super().__init__(config=config) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.TACTILE + + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box( + low=0.0, + high=self._max_detection_radius, + shape=(1,), + dtype=np.float32, + ) + + def get_observation( + self, observations, *args: Any, episode, **kwargs: Any + ): + current_position = self._sim.get_agent_state().position + + return np.array( + [ + self._sim.distance_to_closest_obstacle( + current_position, self._max_detection_radius + ) + ], + dtype=np.float32, + ) + + +@registry.register_measure +class Success(Measure): + r"""Whether or not the agent succeeded at its task + + This measure depends on DistanceToGoal measure. 
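In pseudocode, the rule applied in update_metric below (this is only a restatement of the code that follows, not additional behaviour):

    d = task.measurements.measures[DistanceToGoal.cls_uuid].get_metric()
    stop_called = getattr(task, "is_stop_called", False)
    success = 1.0 if (stop_called and d < config.SUCCESS_DISTANCE) else 0.0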
+    """
+
+    cls_uuid: str = "success"
+
+    def __init__(
+        self, sim: Simulator, config: Config, *args: Any, **kwargs: Any
+    ):
+        self._sim = sim
+        self._config = config
+
+        super().__init__()
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def reset_metric(self, episode, task, *args: Any, **kwargs: Any):
+        task.measurements.check_measure_dependencies(
+            self.uuid, [DistanceToGoal.cls_uuid]
+        )
+        self.update_metric(episode=episode, task=task, *args, **kwargs)  # type: ignore
+
+    def update_metric(
+        self, episode, task: EmbodiedTask, *args: Any, **kwargs: Any
+    ):
+        distance_to_target = task.measurements.measures[
+            DistanceToGoal.cls_uuid
+        ].get_metric()
+
+        if (
+            hasattr(task, "is_stop_called")
+            and task.is_stop_called  # type: ignore
+            and distance_to_target < self._config.SUCCESS_DISTANCE
+        ):
+            self._metric = 1.0
+        else:
+            self._metric = 0.0
+
+
+@registry.register_measure
+class SPL(Measure):
+    r"""SPL (Success weighted by Path Length)
+
+    ref: On Evaluation of Embodied Agents - Anderson et al.
+    https://arxiv.org/pdf/1807.06757.pdf
+    The measure depends on the Distance to Goal measure and the Success
+    measure to improve computational performance for sophisticated goal areas.
+    """
+
+    def __init__(
+        self, sim: Simulator, config: Config, *args: Any, **kwargs: Any
+    ):
+        self._previous_position = None
+        self._start_end_episode_distance = None
+        self._agent_episode_distance: Optional[float] = None
+        self._episode_view_points = None
+        self._sim = sim
+        self._config = config
+
+        super().__init__()
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return "spl"
+
+    def reset_metric(self, episode, task, *args: Any, **kwargs: Any):
+        task.measurements.check_measure_dependencies(
+            self.uuid, [DistanceToGoal.cls_uuid, Success.cls_uuid]
+        )
+
+        self._previous_position = self._sim.get_agent_state().position
+        self._agent_episode_distance = 0.0
+        self._start_end_episode_distance = task.measurements.measures[
+            DistanceToGoal.cls_uuid
+        ].get_metric()
+        self.update_metric(  # type:ignore
+            episode=episode, task=task, *args, **kwargs
+        )
+
+    def _euclidean_distance(self, position_a, position_b):
+        return np.linalg.norm(position_b - position_a, ord=2)
+
+    def update_metric(
+        self, episode, task: EmbodiedTask, *args: Any, **kwargs: Any
+    ):
+        ep_success = task.measurements.measures[Success.cls_uuid].get_metric()
+
+        current_position = self._sim.get_agent_state().position
+        self._agent_episode_distance += self._euclidean_distance(
+            current_position, self._previous_position
+        )
+
+        self._previous_position = current_position
+
+        try:
+            self._metric = ep_success * (
+                self._start_end_episode_distance
+                / max(
+                    self._start_end_episode_distance,
+                    self._agent_episode_distance,
+                )
+            )
+        except ZeroDivisionError:
+            self._metric = 0
+
+
+@registry.register_measure
+class SoftSPL(SPL):
+    r"""Soft SPL
+
+    Similar to SPL, but with a relaxed soft-success criterion. Instead of a
+    boolean, success is computed as the fraction of the starting distance that
+    has been covered: max(0, 1 - distance to goal / starting distance to goal).
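In symbols, with d_0 the starting geodesic distance to the goal, d_t the current distance, and l_t the cumulative path length walked so far, the metric computed in update_metric below is:

    soft_success = max(0, 1 - d_t / d_0)
    SoftSPL = soft_success * d_0 / max(d_0, l_t)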
+    """
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return "softspl"
+
+    def reset_metric(self, episode, task, *args: Any, **kwargs: Any):
+        task.measurements.check_measure_dependencies(
+            self.uuid, [DistanceToGoal.cls_uuid]
+        )
+
+        self._previous_position = self._sim.get_agent_state().position
+        self._agent_episode_distance = 0.0
+        self._start_end_episode_distance = task.measurements.measures[
+            DistanceToGoal.cls_uuid
+        ].get_metric()
+        self.update_metric(episode=episode, task=task, *args, **kwargs)  # type: ignore
+
+    def update_metric(self, episode, task, *args: Any, **kwargs: Any):
+        current_position = self._sim.get_agent_state().position
+        distance_to_target = task.measurements.measures[
+            DistanceToGoal.cls_uuid
+        ].get_metric()
+
+        try:
+            ep_soft_success = max(
+                0, (1 - distance_to_target / self._start_end_episode_distance)
+            )
+        except ZeroDivisionError:
+            ep_soft_success = 0
+
+        self._agent_episode_distance += self._euclidean_distance(
+            current_position, self._previous_position
+        )
+
+        self._previous_position = current_position
+
+        try:
+            self._metric = ep_soft_success * (
+                self._start_end_episode_distance
+                / max(
+                    self._start_end_episode_distance,
+                    self._agent_episode_distance,
+                )
+            )
+        except ZeroDivisionError:
+            self._metric = 0
+
+
+@registry.register_measure
+class Collisions(Measure):
+    def __init__(self, sim, config, *args: Any, **kwargs: Any):
+        self._sim = sim
+        self._config = config
+        self._metric = None
+        super().__init__()
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return "collisions"
+
+    def reset_metric(self, episode, *args: Any, **kwargs: Any):
+        self._metric = None
+
+    def update_metric(self, episode, action, *args: Any, **kwargs: Any):
+        if self._metric is None:
+            self._metric = {"count": 0, "is_collision": False}
+        self._metric["is_collision"] = False
+        if self._sim.previous_step_collided:
+            self._metric["count"] += 1
+            self._metric["is_collision"] = True
+
+
+@registry.register_measure
+class TopDownMap(Measure):
+    r"""Top Down Map measure"""
+
+    def __init__(
+        self, sim: "HabitatSim", config: Config, *args: Any, **kwargs: Any
+    ):
+        self._sim = sim
+        self._config = config
+        self._grid_delta = config.MAP_PADDING
+        self._step_count: Optional[int] = None
+        self._map_resolution = config.MAP_RESOLUTION
+        self._ind_x_min: Optional[int] = None
+        self._ind_x_max: Optional[int] = None
+        self._ind_y_min: Optional[int] = None
+        self._ind_y_max: Optional[int] = None
+        self._previous_xy_location: Optional[Tuple[int, int]] = None
+        self._top_down_map: Optional[np.ndarray] = None
+        self._shortest_path_points: Optional[List[Tuple[int, int]]] = None
+        self.line_thickness = int(
+            np.round(self._map_resolution * 2 / MAP_THICKNESS_SCALAR)
+        )
+        self.point_padding = 2 * int(
+            np.ceil(self._map_resolution / MAP_THICKNESS_SCALAR)
+        )
+        super().__init__()
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return "top_down_map"
+
+    def get_original_map(self):
+        top_down_map = maps.get_topdown_map_from_sim(
+            self._sim,
+            map_resolution=self._map_resolution,
+            draw_border=self._config.DRAW_BORDER,
+        )
+
+        if self._config.FOG_OF_WAR.DRAW:
+            self._fog_of_war_mask = np.zeros_like(top_down_map)
+        else:
+            self._fog_of_war_mask = None
+
+        return top_down_map
+
+    def
_draw_point(self, position, point_type): + t_x, t_y = maps.to_grid( + position[2], + position[0], + self._top_down_map.shape[0:2], + sim=self._sim, + ) + self._top_down_map[ + t_x - self.point_padding : t_x + self.point_padding + 1, + t_y - self.point_padding : t_y + self.point_padding + 1, + ] = point_type + + def _draw_goals_view_points(self, episode): + if self._config.DRAW_VIEW_POINTS: + for goal in episode.goals: + if self._is_on_same_floor(goal.position[1]): + try: + if goal.view_points is not None: + for view_point in goal.view_points: + self._draw_point( + view_point.agent_state.position, + maps.MAP_VIEW_POINT_INDICATOR, + ) + except AttributeError: + pass + + def _draw_goals_positions(self, episode): + if self._config.DRAW_GOAL_POSITIONS: + + for goal in episode.goals: + if self._is_on_same_floor(goal.position[1]): + try: + self._draw_point( + goal.position, maps.MAP_TARGET_POINT_INDICATOR + ) + except AttributeError: + pass + + def _draw_goals_aabb(self, episode): + if self._config.DRAW_GOAL_AABBS: + for goal in episode.goals: + try: + sem_scene = self._sim.semantic_annotations() + object_id = goal.object_id + assert int( + sem_scene.objects[object_id].id.split("_")[-1] + ) == int( + goal.object_id + ), f"Object_id doesn't correspond to id in semantic scene objects dictionary for episode: {episode}" + + center = sem_scene.objects[object_id].aabb.center + x_len, _, z_len = ( + sem_scene.objects[object_id].aabb.sizes / 2.0 + ) + # Nodes to draw rectangle + corners = [ + center + np.array([x, 0, z]) + for x, z in [ + (-x_len, -z_len), + (-x_len, z_len), + (x_len, z_len), + (x_len, -z_len), + (-x_len, -z_len), + ] + if self._is_on_same_floor(center[1]) + ] + + map_corners = [ + maps.to_grid( + p[2], + p[0], + self._top_down_map.shape[0:2], + sim=self._sim, + ) + for p in corners + ] + + maps.draw_path( + self._top_down_map, + map_corners, + maps.MAP_TARGET_BOUNDING_BOX, + self.line_thickness, + ) + except AttributeError: + pass + + def _draw_shortest_path( + self, episode: NavigationEpisode, agent_position: AgentState + ): + if self._config.DRAW_SHORTEST_PATH: + _shortest_path_points = ( + self._sim.get_straight_shortest_path_points( + agent_position, episode.goals[0].position + ) + ) + self._shortest_path_points = [ + maps.to_grid( + p[2], p[0], self._top_down_map.shape[0:2], sim=self._sim + ) + for p in _shortest_path_points + ] + maps.draw_path( + self._top_down_map, + self._shortest_path_points, + maps.MAP_SHORTEST_PATH_COLOR, + self.line_thickness, + ) + + def _is_on_same_floor( + self, height, ref_floor_height=None, ceiling_height=2.0 + ): + if ref_floor_height is None: + ref_floor_height = self._sim.get_agent(0).state.position[1] + return ref_floor_height < height < ref_floor_height + ceiling_height + + def reset_metric(self, episode, *args: Any, **kwargs: Any): + self._step_count = 0 + self._metric = None + self._top_down_map = self.get_original_map() + agent_position = self._sim.get_agent_state().position + a_x, a_y = maps.to_grid( + agent_position[2], + agent_position[0], + self._top_down_map.shape[0:2], + sim=self._sim, + ) + self._previous_xy_location = (a_y, a_x) + + self.update_fog_of_war_mask(np.array([a_x, a_y])) + + # draw source and target parts last to avoid overlap + self._draw_goals_view_points(episode) + self._draw_goals_aabb(episode) + self._draw_goals_positions(episode) + + self._draw_shortest_path(episode, agent_position) + + if self._config.DRAW_SOURCE: + self._draw_point( + episode.start_position, maps.MAP_SOURCE_POINT_INDICATOR + ) + + def 
update_metric(self, episode, action, *args: Any, **kwargs: Any): + self._step_count += 1 + house_map, map_agent_x, map_agent_y = self.update_map( + self._sim.get_agent_state().position + ) + + self._metric = { + "map": house_map, + "fog_of_war_mask": self._fog_of_war_mask, + "agent_map_coord": (map_agent_x, map_agent_y), + "agent_angle": self.get_polar_angle(), + } + + def get_polar_angle(self): + agent_state = self._sim.get_agent_state() + # quaternion is in x, y, z, w format + ref_rotation = agent_state.rotation + + heading_vector = quaternion_rotate_vector( + ref_rotation.inverse(), np.array([0, 0, -1]) + ) + + phi = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1] + z_neg_z_flip = np.pi + return np.array(phi) + z_neg_z_flip + + def update_map(self, agent_position): + a_x, a_y = maps.to_grid( + agent_position[2], + agent_position[0], + self._top_down_map.shape[0:2], + sim=self._sim, + ) + # Don't draw over the source point + if self._top_down_map[a_x, a_y] != maps.MAP_SOURCE_POINT_INDICATOR: + color = 10 + min( + self._step_count * 245 // self._config.MAX_EPISODE_STEPS, 245 + ) + + thickness = self.line_thickness + cv2.line( + self._top_down_map, + self._previous_xy_location, + (a_y, a_x), + color, + thickness=thickness, + ) + + self.update_fog_of_war_mask(np.array([a_x, a_y])) + + self._previous_xy_location = (a_y, a_x) + return self._top_down_map, a_x, a_y + + def update_fog_of_war_mask(self, agent_position): + if self._config.FOG_OF_WAR.DRAW: + self._fog_of_war_mask = fog_of_war.reveal_fog_of_war( + self._top_down_map, + self._fog_of_war_mask, + agent_position, + self.get_polar_angle(), + fov=self._config.FOG_OF_WAR.FOV, + max_line_len=self._config.FOG_OF_WAR.VISIBILITY_DIST + / maps.calculate_meters_per_pixel( + self._map_resolution, sim=self._sim + ), + ) + + +@registry.register_measure +class DistanceToGoal(Measure): + """The measure calculates a distance towards the goal.""" + + cls_uuid: str = "distance_to_goal" + + def __init__( + self, sim: Simulator, config: Config, *args: Any, **kwargs: Any + ): + self._previous_position: Optional[Tuple[float, float, float]] = None + self._sim = sim + self._config = config + self._episode_view_points: Optional[ + List[Tuple[float, float, float]] + ] = None + + super().__init__(**kwargs) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def reset_metric(self, episode, *args: Any, **kwargs: Any): + self._previous_position = None + self._metric = None + if self._config.DISTANCE_TO == "VIEW_POINTS": + self._episode_view_points = [ + view_point.agent_state.position + for goal in episode.goals + for view_point in goal.view_points + ] + self.update_metric(episode=episode, *args, **kwargs) # type: ignore + + def update_metric( + self, episode: NavigationEpisode, *args: Any, **kwargs: Any + ): + current_position = self._sim.get_agent_state().position + + if self._previous_position is None or not np.allclose( + self._previous_position, current_position, atol=1e-4 + ): + if self._config.DISTANCE_TO == "POINT": + distance_to_target = self._sim.geodesic_distance( + current_position, + [goal.position for goal in episode.goals], + episode, + ) + elif self._config.DISTANCE_TO == "VIEW_POINTS": + distance_to_target = self._sim.geodesic_distance( + current_position, self._episode_view_points, episode + ) + else: + logger.error( + f"Non valid DISTANCE_TO parameter was provided: {self._config.DISTANCE_TO}" + ) + + self._previous_position = current_position + self._metric = distance_to_target + + 
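The measures above all follow the same contract: register with @registry.register_measure, declare dependencies in reset_metric, and write the current value to self._metric. A minimal sketch of a new, hypothetical measure built on top of DistanceToGoal (illustrative only, not part of this diff):

    @registry.register_measure
    class GoalProximityBonus(Measure):  # hypothetical measure for illustration
        cls_uuid: str = "goal_proximity_bonus"

        def __init__(self, sim, config, *args, **kwargs):
            self._sim = sim
            self._config = config
            super().__init__()

        def _get_uuid(self, *args, **kwargs) -> str:
            return self.cls_uuid

        def reset_metric(self, episode, task, *args, **kwargs):
            # Fail fast if DistanceToGoal is not in the measure suite.
            task.measurements.check_measure_dependencies(
                self.uuid, [DistanceToGoal.cls_uuid]
            )
            self.update_metric(episode=episode, task=task, *args, **kwargs)

        def update_metric(self, episode, task, *args, **kwargs):
            d = task.measurements.measures[DistanceToGoal.cls_uuid].get_metric()
            self._metric = 1.0 / (1.0 + d)  # larger as the agent nears the goal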
+@registry.register_task_action
+class MoveForwardAction(SimulatorTaskAction):
+    name: str = "MOVE_FORWARD"
+
+    def step(self, *args: Any, **kwargs: Any):
+        r"""This method is called from ``Env`` on each ``step``."""
+        return self._sim.step(HabitatSimActions.MOVE_FORWARD)
+
+
+@registry.register_task_action
+class TurnLeftAction(SimulatorTaskAction):
+    def step(self, *args: Any, **kwargs: Any):
+        r"""This method is called from ``Env`` on each ``step``."""
+        return self._sim.step(HabitatSimActions.TURN_LEFT)
+
+
+@registry.register_task_action
+class TurnRightAction(SimulatorTaskAction):
+    def step(self, *args: Any, **kwargs: Any):
+        r"""This method is called from ``Env`` on each ``step``."""
+        return self._sim.step(HabitatSimActions.TURN_RIGHT)
+
+
+@registry.register_task_action
+class StopAction(SimulatorTaskAction):
+    name: str = "STOP"
+
+    def reset(self, task: EmbodiedTask, *args: Any, **kwargs: Any):
+        task.is_stop_called = False  # type: ignore
+
+    def step(self, task: EmbodiedTask, *args: Any, **kwargs: Any):
+        r"""This method is called from ``Env`` on each ``step``."""
+        task.is_stop_called = True  # type: ignore
+        return self._sim.get_observations_at()  # type: ignore
+
+
+@registry.register_task_action
+class LookUpAction(SimulatorTaskAction):
+    def step(self, *args: Any, **kwargs: Any):
+        r"""This method is called from ``Env`` on each ``step``."""
+        return self._sim.step(HabitatSimActions.LOOK_UP)
+
+
+@registry.register_task_action
+class LookDownAction(SimulatorTaskAction):
+    def step(self, *args: Any, **kwargs: Any):
+        r"""This method is called from ``Env`` on each ``step``."""
+        return self._sim.step(HabitatSimActions.LOOK_DOWN)
+
+
+@registry.register_task_action
+class TeleportAction(SimulatorTaskAction):
+    # TODO @maksymets: Propagate through Simulator class
+    COORDINATE_EPSILON = 1e-6
+    COORDINATE_MIN = -62.3241 - COORDINATE_EPSILON
+    COORDINATE_MAX = 90.0399 + COORDINATE_EPSILON
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return "TELEPORT"
+
+    def step(
+        self,
+        *args: Any,
+        position: List[float],
+        rotation: List[float],
+        **kwargs: Any,
+    ):
+        r"""This method is called from ``Env`` on each
+        ``step``.
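For example, a teleport action dict passed to an environment would look like this (sketch; the env handle and coordinates are hypothetical, and position must be navigable or the agent stays put):

    env.step(
        {
            "action": "TELEPORT",
            "action_args": {
                "position": [1.0, 0.15, -2.0],
                "rotation": [0.0, 0.0, 0.0, 1.0],  # unit quaternion (x, y, z, w)
            },
        }
    )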
+        """
+
+        if not isinstance(rotation, list):
+            rotation = list(rotation)
+
+        if not self._sim.is_navigable(position):
+            return self._sim.get_observations_at()  # type: ignore
+
+        return self._sim.get_observations_at(
+            position=position, rotation=rotation, keep_agent_at_new_pose=True
+        )
+
+    @property
+    def action_space(self) -> spaces.Dict:
+        return spaces.Dict(
+            {
+                "position": spaces.Box(
+                    low=np.array([self.COORDINATE_MIN] * 3),
+                    high=np.array([self.COORDINATE_MAX] * 3),
+                    dtype=np.float32,
+                ),
+                "rotation": spaces.Box(
+                    low=np.array([-1.0, -1.0, -1.0, -1.0]),
+                    high=np.array([1.0, 1.0, 1.0, 1.0]),
+                    dtype=np.float32,
+                ),
+            }
+        )
+
+
+@registry.register_task(name="Nav-v0")
+class NavigationTask(EmbodiedTask):
+    def __init__(
+        self, config: Config, sim: Simulator, dataset: Optional[Dataset] = None
+    ) -> None:
+        super().__init__(config=config, sim=sim, dataset=dataset)
+
+    def overwrite_sim_config(self, sim_config: Any, episode: Episode) -> Any:
+        return merge_sim_episode_config(sim_config, episode)
+
+    def _check_episode_is_active(self, *args: Any, **kwargs: Any) -> bool:
+        return not getattr(self, "is_stop_called", False)
diff --git a/habitat-lab-dialog/habitat/tasks/nav/object_nav_task.py b/habitat-lab-dialog/habitat/tasks/nav/object_nav_task.py
new file mode 100644
index 0000000..ddb357f
--- /dev/null
+++ b/habitat-lab-dialog/habitat/tasks/nav/object_nav_task.py
@@ -0,0 +1,184 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from typing import Any, List, Optional
+
+import attr
+import numpy as np
+from gym import spaces
+
+from habitat.config import Config
+from habitat.core.logging import logger
+from habitat.core.registry import registry
+from habitat.core.simulator import AgentState, Sensor, SensorTypes
+from habitat.core.utils import not_none_validator
+from habitat.tasks.nav.nav import (
+    NavigationEpisode,
+    NavigationGoal,
+    NavigationTask,
+)
+
+try:
+    from habitat.datasets.object_nav.object_nav_dataset import (
+        ObjectNavDatasetV1,
+    )
+except ImportError:
+    pass
+
+
+@attr.s(auto_attribs=True, kw_only=True)
+class ObjectGoalNavEpisode(NavigationEpisode):
+    r"""ObjectGoal Navigation Episode
+
+    :param object_category: Category of the object
+    """
+    object_category: Optional[str] = None
+
+    @property
+    def goals_key(self) -> str:
+        r"""The key to retrieve the goals"""
+        return f"{os.path.basename(self.scene_id)}_{self.object_category}"
+
+
+@attr.s(auto_attribs=True)
+class ObjectViewLocation:
+    r"""ObjectViewLocation provides information about a navigable position
+    around an object goal from which the object is visible, under the specific
+    agent configuration with which the episode's dataset was created. View
+    points are an important part of metric calculation: they describe the
+    success area for the navigation.
+
+    Args:
+        agent_state: navigable AgentState with a position and a rotation where
+            the object is visible.
+        iou: an intersection over union of the object and a rectangle in the
+            center of view. This metric is used to evaluate how good the object
+            view is from the current position. Higher iou means a better view;
+            iou equals 1.0 if the whole object is inside the rectangle and no
+            pixel inside the rectangle belongs to anything except the object.
+    """
+    agent_state: AgentState
+    iou: Optional[float]
+
+
+@attr.s(auto_attribs=True, kw_only=True)
+class ObjectGoal(NavigationGoal):
+    r"""Object goal provides information about an object that is the target
+    for navigation. It can be specified by object_id, position and object
+    category. An important part for metrics calculation are view points that
+    describe the success area for the navigation.
+
+    Args:
+        object_id: id that can be used to retrieve object from the semantic
+            scene annotation
+        object_name: name of the object
+        object_category: object category name usually similar to scene semantic
+            categories
+        room_id: id of a room where object is located, can be used to retrieve
+            room from the semantic scene annotation
+        room_name: name of the room, where object is located
+        view_points: navigable positions around the object with specified
+            proximity to the object surface used for navigation metrics
+            calculation. The object is visible from these positions.
+    """
+
+    object_id: str = attr.ib(default=None, validator=not_none_validator)
+    object_name: Optional[str] = None
+    object_name_id: Optional[int] = None
+    object_category: Optional[str] = None
+    room_id: Optional[str] = None
+    room_name: Optional[str] = None
+    view_points: Optional[List[ObjectViewLocation]] = None
+
+
+@registry.register_sensor
+class ObjectGoalSensor(Sensor):
+    r"""A sensor for Object Goal specification as observations which is used in
+    ObjectGoal Navigation. The goal is expected to be specified by object_id or
+    semantic category id.
+    For the agent in simulator the forward direction is along negative-z.
+    In polar coordinate format the angle returned is azimuth to the goal.
+    Args:
+        sim: a reference to the simulator for calculating task observations.
+        config: a config for the ObjectGoalSensor sensor. Can contain field
+            GOAL_SPEC that specifies which id to use for goal specification,
+            GOAL_SPEC_MAX_VAL the maximum object_id possible used for
+            observation space definition.
+        dataset: an Object Goal navigation dataset that contains dictionaries
+            of category id to text mappings.
+    """
+    cls_uuid: str = "objectgoal"
+
+    def __init__(
+        self,
+        sim,
+        config: Config,
+        dataset: "ObjectNavDatasetV1",
+        *args: Any,
+        **kwargs: Any,
+    ):
+        self._sim = sim
+        self._dataset = dataset
+        super().__init__(config=config)
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.cls_uuid
+
+    def _get_sensor_type(self, *args: Any, **kwargs: Any):
+        return SensorTypes.SEMANTIC
+
+    def _get_observation_space(self, *args: Any, **kwargs: Any):
+        sensor_shape = (1,)
+        max_value = self.config.GOAL_SPEC_MAX_VAL - 1
+        if self.config.GOAL_SPEC == "TASK_CATEGORY_ID":
+            max_value = max(
+                self._dataset.category_to_task_category_id.values()
+            )
+
+        return spaces.Box(
+            low=0, high=max_value, shape=sensor_shape, dtype=np.int64
+        )
+
+    def get_observation(
+        self,
+        observations,
+        *args: Any,
+        episode: ObjectGoalNavEpisode,
+        **kwargs: Any,
+    ) -> Optional[int]:
+
+        if len(episode.goals) == 0:
+            logger.error(
+                f"No goal specified for episode {episode.episode_id}."
+            )
+            return None
+        if not isinstance(episode.goals[0], ObjectGoal):
+            logger.error(
+                f"First goal should be ObjectGoal, episode {episode.episode_id}."
+            )
+            return None
+        category_name = episode.object_category
+        if self.config.GOAL_SPEC == "TASK_CATEGORY_ID":
+            return np.array(
+                [self._dataset.category_to_task_category_id[category_name]],
+                dtype=np.int64,
+            )
+        elif self.config.GOAL_SPEC == "OBJECT_ID":
+            obj_goal = episode.goals[0]
+            assert isinstance(obj_goal, ObjectGoal)  # for type checking
+            return np.array([obj_goal.object_name_id], dtype=np.int64)
+        else:
+            raise RuntimeError(
+                "Wrong GOAL_SPEC specified for ObjectGoalSensor."
+            )
+
+
+@registry.register_task(name="ObjectNav-v1")
+class ObjectNavigationTask(NavigationTask):
+    r"""An Object Navigation Task class for task-specific methods.
+    Used to explicitly state the type of the task in the config.
+    """
diff --git a/habitat-lab-dialog/habitat/tasks/nav/shortest_path_follower.py b/habitat-lab-dialog/habitat/tasks/nav/shortest_path_follower.py
new file mode 100644
index 0000000..fc200d8
--- /dev/null
+++ b/habitat-lab-dialog/habitat/tasks/nav/shortest_path_follower.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import warnings
+from typing import Optional, Union
+
+import numpy as np
+
+import habitat_sim
+from habitat.sims.habitat_simulator.actions import HabitatSimActions
+from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim
+
+
+def action_to_one_hot(action: int) -> np.array:
+    one_hot = np.zeros(len(HabitatSimActions), dtype=np.float32)
+    one_hot[action] = 1
+    return one_hot
+
+
+class ShortestPathFollower:
+    r"""Utility class for extracting the action on the shortest path to the
+    goal.
+
+    :param sim: HabitatSim instance.
+    :param goal_radius: Distance between the agent and the goal for it to be
+        considered successful.
+    :param return_one_hot: If true, returns a one-hot encoding of the action
+        (useful for training ML agents). If false, returns the
+        SimulatorAction.
+    :param stop_on_error: Return stop if the follower is unable to determine a
+        suitable action to take next.
+        If false, a habitat_sim.errors.GreedyFollowerError is raised instead.
+    """
+
+    def __init__(
+        self,
+        sim: HabitatSim,
+        goal_radius: float,
+        return_one_hot: bool = True,
+        stop_on_error: bool = True,
+    ):
+
+        self._return_one_hot = return_one_hot
+        self._sim = sim
+        self._goal_radius = goal_radius
+        self._follower: Optional[habitat_sim.GreedyGeodesicFollower] = None
+        self._current_scene = None
+        self._stop_on_error = stop_on_error
+
+    def _build_follower(self):
+        if self._current_scene != self._sim.habitat_config.SCENE:
+            self._follower = self._sim.make_greedy_follower(
+                0,
+                self._goal_radius,
+                stop_key=HabitatSimActions.STOP,
+                forward_key=HabitatSimActions.MOVE_FORWARD,
+                left_key=HabitatSimActions.TURN_LEFT,
+                right_key=HabitatSimActions.TURN_RIGHT,
+            )
+            self._current_scene = self._sim.habitat_config.SCENE
+
+    def _get_return_value(self, action) -> Union[int, np.array]:
+        if self._return_one_hot:
+            return action_to_one_hot(action)
+        else:
+            return action
+
+    def get_next_action(
+        self, goal_pos: np.array
+    ) -> Optional[Union[int, np.array]]:
+        """Returns the next action along the shortest path."""
+        self._build_follower()
+        assert self._follower is not None
+        try:
+            next_action = self._follower.next_action_along(goal_pos)
+        except habitat_sim.errors.GreedyFollowerError as e:
+            if self._stop_on_error:
+                next_action = HabitatSimActions.STOP
+            else:
+                raise e
+
+        return self._get_return_value(next_action)
+
+    @property
+    def mode(self):
+        warnings.warn(".mode is deprecated", DeprecationWarning)
+        return ""
+
+    @mode.setter
+    def mode(self, new_mode: str):
+        warnings.warn(".mode is deprecated", DeprecationWarning)
diff --git a/habitat-lab-dialog/habitat/tasks/registration.py b/habitat-lab-dialog/habitat/tasks/registration.py
new file mode 100644
index 0000000..7e4d035
--- /dev/null
+++ b/habitat-lab-dialog/habitat/tasks/registration.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from habitat.core.logging import logger
+from habitat.core.registry import registry
+from habitat.tasks.eqa import _try_register_eqa_task
+from habitat.tasks.nav import _try_register_nav_task
+from habitat.tasks.vln import _try_register_vln_task
+
+
+def make_task(id_task, **kwargs):
+    logger.info("Initializing task {}".format(id_task))
+    _task = registry.get_task(id_task)
+    assert _task is not None, "Could not find task with name {}".format(
+        id_task
+    )
+
+    return _task(**kwargs)
+
+
+_try_register_eqa_task()
+_try_register_nav_task()
+_try_register_vln_task()
diff --git a/habitat-lab-dialog/habitat/tasks/utils.py b/habitat-lab-dialog/habitat/tasks/utils.py
new file mode 100644
index 0000000..220e68c
--- /dev/null
+++ b/habitat-lab-dialog/habitat/tasks/utils.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
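+
+# Usage sketch for the helpers below (assumes `q` is a normalized agent
+# quaternion with components w, x, y, z, e.g. from agent_state.rotation):
+#
+#     rot = quaternion_to_rotation(q.w, q.x, q.y, q.z)
+#     forward = rot @ np.array([0.0, 0.0, -1.0])  # habitat forward is -z
+#     _, phi = cartesian_to_polar(-forward[2], forward[0])
+#
+# `phi` is then the agent's heading (azimuth) in the x-z plane.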
+ +import numpy as np +import quaternion # noqa # pylint: disable=unused-import + + +def quaternion_to_rotation(q_r, q_i, q_j, q_k): + r""" + ref: https://en.wikipedia.org/wiki/Quaternions_and_spatial_rotation + """ + s = 1 # unit quaternion + rotation_mat = np.array( + [ + [ + 1 - 2 * s * (q_j ** 2 + q_k ** 2), + 2 * s * (q_i * q_j - q_k * q_r), + 2 * s * (q_i * q_k + q_j * q_r), + ], + [ + 2 * s * (q_i * q_j + q_k * q_r), + 1 - 2 * s * (q_i ** 2 + q_k ** 2), + 2 * s * (q_j * q_k - q_i * q_r), + ], + [ + 2 * s * (q_i * q_k - q_j * q_r), + 2 * s * (q_j * q_k + q_i * q_r), + 1 - 2 * s * (q_i ** 2 + q_j ** 2), + ], + ], + dtype=np.float32, + ) + return rotation_mat + + +def cartesian_to_polar(x, y): + rho = np.sqrt(x ** 2 + y ** 2) + phi = np.arctan2(y, x) + return rho, phi + + +def compute_pixel_coverage(instance_seg, object_id): + cand_mask = instance_seg == object_id + score = cand_mask.sum().astype(np.float64) / cand_mask.size + return score diff --git a/habitat-lab-dialog/habitat/tasks/vln/__init__.py b/habitat-lab-dialog/habitat/tasks/vln/__init__.py new file mode 100644 index 0000000..3d5e9f5 --- /dev/null +++ b/habitat-lab-dialog/habitat/tasks/vln/__init__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat.core.embodied_task import EmbodiedTask +from habitat.core.registry import registry + + +def _try_register_vln_task(): + try: + from habitat.tasks.vln.vln import VLNTask # noqa: F401 + except ImportError as e: + vlntask_import_error = e + + @registry.register_task(name="VLN-v0") + class VLNTaskImportError(EmbodiedTask): + def __init__(self, *args, **kwargs): + raise vlntask_import_error diff --git a/habitat-lab-dialog/habitat/tasks/vln/vln.py b/habitat-lab-dialog/habitat/tasks/vln/vln.py new file mode 100644 index 0000000..eb7b624 --- /dev/null +++ b/habitat-lab-dialog/habitat/tasks/vln/vln.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python3 + +# Owners/maintainers of the Vision and Language Navigation task: +# @jacobkrantz: Jacob Krantz +# @koshyanand: Anand Koshy + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, Dict, List, Optional + +import attr +from gym import spaces + +from habitat.core.registry import registry +from habitat.core.simulator import Observations, Sensor +from habitat.core.utils import not_none_validator +from habitat.tasks.nav.nav import NavigationEpisode, NavigationTask + + +@attr.s(auto_attribs=True) +class InstructionData: + instruction_text: str + instruction_tokens: Optional[List[str]] = None + + +@attr.s(auto_attribs=True, kw_only=True) +class VLNEpisode(NavigationEpisode): + r"""Specification of episode that includes initial position and rotation + of agent, goal specifications, instruction specifications, reference path, + and optional shortest paths. + + Args: + episode_id: id of episode in the dataset + scene_id: id of scene inside the simulator. + start_position: numpy ndarray containing 3 entries for (x, y, z). + start_rotation: numpy ndarray with 4 entries for (x, y, z, w) + elements of unit quaternion (versor) representing agent 3D + orientation. + goals: list of goals specifications + reference_path: List of (x, y, z) positions which gives the reference + path to the goal that aligns with the instruction. 
+        instruction: a single natural language instruction guiding the agent
+            to the goal.
+        trajectory_id: id of the ground truth trajectory path.
+    """
+    reference_path: List[List[float]] = attr.ib(
+        default=None, validator=not_none_validator
+    )
+    instruction: InstructionData = attr.ib(
+        default=None, validator=not_none_validator
+    )
+    trajectory_id: int = attr.ib(default=None, validator=not_none_validator)
+
+
+@registry.register_sensor(name="InstructionSensor")
+class InstructionSensor(Sensor):
+    def __init__(self, **kwargs):
+        self.uuid = "instruction"
+        self.observation_space = spaces.Discrete(0)
+
+    def _get_uuid(self, *args: Any, **kwargs: Any) -> str:
+        return self.uuid
+
+    def _get_observation(
+        self,
+        observations: Dict[str, Observations],
+        episode: VLNEpisode,
+        **kwargs
+    ):
+        return {
+            "text": episode.instruction.instruction_text,
+            "tokens": episode.instruction.instruction_tokens,
+            "trajectory_id": episode.trajectory_id,
+        }
+
+    def get_observation(self, **kwargs):
+        return self._get_observation(**kwargs)
+
+
+@registry.register_task(name="VLN-v0")
+class VLNTask(NavigationTask):
+    r"""Vision and Language Navigation Task
+    Goal: An agent must navigate to a goal location in a 3D environment
+        specified by a natural language instruction.
+    Metric: Success weighted by Path Length (SPL)
+    Usage example:
+        examples/vln_reference_path_follower_example.py
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
diff --git a/habitat-lab-dialog/habitat/utils/__init__.py b/habitat-lab-dialog/habitat/utils/__init__.py
new file mode 100644
index 0000000..1b401e1
--- /dev/null
+++ b/habitat-lab-dialog/habitat/utils/__init__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+__all__ = ["visualizations", "geometry_utils"]
diff --git a/habitat-lab-dialog/habitat/utils/geometry_utils.py b/habitat-lab-dialog/habitat/utils/geometry_utils.py
new file mode 100644
index 0000000..18e20cb
--- /dev/null
+++ b/habitat-lab-dialog/habitat/utils/geometry_utils.py
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List, Tuple, Union
+
+import numpy as np
+import quaternion
+
+EPSILON = 1e-8
+
+
+def angle_between_quaternions(q1: np.quaternion, q2: np.quaternion) -> float:
+    r"""Returns the angle (in radians) between two quaternions. This angle
+    will always be positive.
+    """
+    q1_inv = np.conjugate(q1)
+    dq = quaternion.as_float_array(q1_inv * q2)
+
+    return 2 * np.arctan2(np.linalg.norm(dq[1:]), np.abs(dq[0]))
+
+
+def quaternion_from_two_vectors(v0: np.array, v1: np.array) -> np.quaternion:
+    r"""Computes the quaternion representation of v1 using v0 as the origin."""
+    v0 = v0 / np.linalg.norm(v0)
+    v1 = v1 / np.linalg.norm(v1)
+    c = v0.dot(v1)
+    # Epsilon prevents issues at poles.
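+    # When v0 and v1 are nearly antiparallel (c ~ -1), np.cross(v0, v1) is
+    # ill-conditioned, so the branch below instead recovers a rotation axis
+    # orthogonal to both vectors from the SVD null space of [v0; v1].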
+    if c < (-1 + EPSILON):
+        c = max(c, -1)
+        m = np.stack([v0, v1], 0)
+        _, _, vh = np.linalg.svd(m, full_matrices=True)
+        axis = vh.T[:, 2]
+        w2 = (1 + c) * 0.5
+        w = np.sqrt(w2)
+        axis = axis * np.sqrt(1 - w2)
+        return np.quaternion(w, *axis)
+
+    axis = np.cross(v0, v1)
+    s = np.sqrt((1 + c) * 2)
+    return np.quaternion(s * 0.5, *(axis / s))
+
+
+def quaternion_to_list(q: np.quaternion):
+    return q.imag.tolist() + [q.real]
+
+
+def quaternion_from_coeff(coeffs: np.ndarray) -> np.quaternion:
+    r"""Creates a quaternion from coeffs in [x, y, z, w] format"""
+    quat = np.quaternion(0, 0, 0, 0)
+    quat.real = coeffs[3]
+    quat.imag = coeffs[0:3]
+    return quat
+
+
+def quaternion_rotate_vector(quat: np.quaternion, v: np.array) -> np.array:
+    r"""Rotates a vector by a quaternion
+    Args:
+        quat: The quaternion to rotate by
+        v: The vector to rotate
+    Returns:
+        np.array: The rotated vector
+    """
+    vq = np.quaternion(0, 0, 0, 0)
+    vq.imag = v
+    return (quat * vq * quat.inverse()).imag
+
+
+def agent_state_target2ref(
+    ref_agent_state: Union[List, Tuple], target_agent_state: Union[List, Tuple]
+) -> Tuple[np.quaternion, np.array]:
+    r"""Computes the target agent_state's rotation and position representation
+    with respect to the coordinate system defined by the reference agent's
+    rotation and position. All rotations must be in [x, y, z, w] format.
+
+    :param ref_agent_state: reference agent_state in the format of
+        [rotation, position]. The rotation and position are given in a
+        common/global coordinate system and define a local coordinate system.
+    :param target_agent_state: target agent_state in the format of
+        [rotation, position]. The rotation and position are given in the same
+        common/global coordinate system and need to be transformed into the
+        local coordinate system defined by ref_agent_state.
+    """
+
+    assert (
+        len(ref_agent_state[1]) == 3
+    ), "Only support Cartesian format currently."
+    assert (
+        len(target_agent_state[1]) == 3
+    ), "Only support Cartesian format currently."
+
+    ref_rotation, ref_position = ref_agent_state
+    target_rotation, target_position = target_agent_state
+
+    # convert all rotation representations to np.quaternion
+    if not isinstance(ref_rotation, np.quaternion):
+        ref_rotation = quaternion_from_coeff(ref_rotation)
+    ref_rotation = ref_rotation.normalized()
+
+    if not isinstance(target_rotation, np.quaternion):
+        target_rotation = quaternion_from_coeff(target_rotation)
+    target_rotation = target_rotation.normalized()
+
+    rotation_in_ref_coordinate = ref_rotation.inverse() * target_rotation
+
+    position_in_ref_coordinate = quaternion_rotate_vector(
+        ref_rotation.inverse(), target_position - ref_position
+    )
+
+    return (rotation_in_ref_coordinate, position_in_ref_coordinate)
diff --git a/habitat-lab-dialog/habitat/utils/pickle5_multiprocessing.py b/habitat-lab-dialog/habitat/utils/pickle5_multiprocessing.py
new file mode 100644
index 0000000..32c1a36
--- /dev/null
+++ b/habitat-lab-dialog/habitat/utils/pickle5_multiprocessing.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
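+
+# Why protocol 5 matters here (illustrative sketch): with pickle protocol 5,
+# large contiguous buffers (e.g. numpy observation arrays) can be handed to a
+# callback instead of being copied inline into the pickle stream:
+#
+#     buffers = []
+#     payload = pickle.dumps(obj, protocol=5, buffer_callback=buffers.append)
+#     restored = pickle.loads(payload, buffers=buffers)
+#
+# The classes below wire this up for multiprocessing connections.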
+ +import io +import sys +from multiprocessing.connection import Connection +from multiprocessing.reduction import ForkingPickler as _ForkingPickler + +from habitat.core.logging import logger + +if sys.version_info[:2] < (3, 8): + # pickle 5 backport + try: + import pickle5 as pickle + except ImportError: + import pickle # type: ignore[no-redef] + + logger.warn( + f"""Warning pickle v5 protocol not supported. + Falling back to pickle version {pickle.HIGHEST_PROTOCOL}. + pip install pickle5 or upgrade to Python 3.8 or greater + for faster performance""" + ) + + class ForkingPickler5(pickle.Pickler): + wrapped = _ForkingPickler + loads = staticmethod(pickle.loads) + + @classmethod + def dumps(cls, obj, protocol: int = -1): + buf = io.BytesIO() + cls(buf, protocol).dump(obj) + return buf.getbuffer() + + def __init__(self, file, protocol: int = -1, **kwargs): + super().__init__(file, protocol, **kwargs) + self.dispatch_table = self.wrapped( + file, protocol, **kwargs + ).dispatch_table + + +else: + import pickle + + ForkingPickler5 = _ForkingPickler + + +class ConnectionWrapper(object): + """Proxy class for _multiprocessing.Connection which uses ForkingPickler to + serialize objects. Will use the Pickle5 backport if available.""" + + def __init__(self, conn: Connection): + self.conn: Connection = conn + + def send(self, obj): + self._check_closed() + self._check_writable() + buf = io.BytesIO() + ForkingPickler5(buf, -1).dump(obj) + self.send_bytes(buf.getvalue()) + + def recv(self): + self._check_closed() + self._check_readable() + buf = self.recv_bytes() + return pickle.loads(buf) + + def __getattr__(self, name): + if "conn" in self.__dict__: + return getattr(self.conn, name) + raise AttributeError( + "'{}' object has no attribute '{}'".format( + type(self).__name__, "conn" + ) + ) diff --git a/habitat-lab-dialog/habitat/utils/profiling_wrapper.py b/habitat-lab-dialog/habitat/utils/profiling_wrapper.py new file mode 100644 index 0000000..50e8afc --- /dev/null +++ b/habitat-lab-dialog/habitat/utils/profiling_wrapper.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +r"""Wrappers for habitat_sim profiling_utils functions. The wrappers are no-ops +if habitat_sim isn't installed. 
+ +Example of capturing an Nsight Systems profile with Habitat-lab: +export HABITAT_PROFILING=1 +export NSYS_NVTX_PROFILER_REGISTER_ONLY=0 # required when using capture range +path/to/nvidia/nsight-systems/bin/nsys profile --sample=none --trace=nvtx --trace-fork-before-exec=true --capture-range=nvtx -p "habitat_capture_range" --stop-on-range-end=true --output=my_profile --export=sqlite python habitat_baselines/run.py --exp-config habitat_baselines/config/pointnav/ppo_pointnav.yaml --run-type train PROFILING.CAPTURE_START_STEP 200 PROFILING.NUM_STEPS_TO_CAPTURE 100 +# look for my_profile.qdrep in working directory +""" + +from contextlib import ContextDecorator + +try: + from habitat_sim.utils import profiling_utils +except ImportError: + profiling_utils = None + + +def configure(capture_start_step=-1, num_steps_to_capture=-1): + r"""Wrapper for habitat_sim profiling_utils.configure""" + if profiling_utils: + profiling_utils.configure(capture_start_step, num_steps_to_capture) + + +def on_start_step(): + r"""Wrapper for habitat_sim profiling_utils.on_start_step""" + if profiling_utils: + profiling_utils.on_start_step() + + +def range_push(msg: str): + r"""Wrapper for habitat_sim profiling_utils.range_push""" + if profiling_utils: + profiling_utils.range_push(msg) + + +def range_pop(): + r"""Wrapper for habitat_sim profiling_utils.range_pop""" + if profiling_utils: + profiling_utils.range_pop() + + +class RangeContext(ContextDecorator): + r"""Annotate a range for profiling. Use as a function decorator or in a with + statement. See habitat_sim profiling_utils. + """ + + def __init__(self, msg: str): + self._msg = msg + + def __enter__(self): + range_push(self._msg) + return self + + def __exit__(self, *exc): + range_pop() + return False diff --git a/habitat-lab-dialog/habitat/utils/test_utils.py b/habitat-lab-dialog/habitat/utils/test_utils.py new file mode 100644 index 0000000..47b0935 --- /dev/null +++ b/habitat-lab-dialog/habitat/utils/test_utils.py @@ -0,0 +1,15 @@ +from habitat.tasks.nav.nav import StopAction + + +def sample_non_stop_action(action_space, num_samples=1): + samples = [] + for _ in range(num_samples): + action = action_space.sample() + while action["action"] == StopAction.name: + action = action_space.sample() + samples.append({"action": action}) + + if num_samples == 1: + return samples[0]["action"] + else: + return samples diff --git a/habitat-lab-dialog/habitat/utils/visualizations/__init__.py b/habitat-lab-dialog/habitat/utils/visualizations/__init__.py new file mode 100644 index 0000000..c27ce95 --- /dev/null +++ b/habitat-lab-dialog/habitat/utils/visualizations/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
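+
+# Usage sketch for this package (the functions are defined in the modules
+# imported below; `sim` is assumed to be an initialized HabitatSim):
+#
+#     top_down = maps.get_topdown_map_from_sim(sim, map_resolution=1024)
+#     colored = maps.colorize_topdown_map(top_down)
+#     utils.images_to_video([colored], "videos", "topdown_demo")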
+ +from habitat.utils.visualizations import maps, utils + +__all__ = ["maps", "utils"] diff --git a/habitat-lab-dialog/habitat/utils/visualizations/assets/maps_topdown_agent_sprite/100x100.png b/habitat-lab-dialog/habitat/utils/visualizations/assets/maps_topdown_agent_sprite/100x100.png new file mode 100644 index 0000000..94b0f93 Binary files /dev/null and b/habitat-lab-dialog/habitat/utils/visualizations/assets/maps_topdown_agent_sprite/100x100.png differ diff --git a/habitat-lab-dialog/habitat/utils/visualizations/fog_of_war.py b/habitat-lab-dialog/habitat/utils/visualizations/fog_of_war.py new file mode 100644 index 0000000..531376c --- /dev/null +++ b/habitat-lab-dialog/habitat/utils/visualizations/fog_of_war.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numba +import numpy as np + +from habitat.utils.visualizations import maps + + +@numba.jit(nopython=True) +def bresenham_supercover_line(pt1, pt2): + r"""Line drawing algo based + on http://eugen.dedu.free.fr/projects/bresenham/ + """ + + ystep, xstep = 1, 1 + + x, y = pt1 + dx, dy = pt2 - pt1 + + if dy < 0: + ystep *= -1 + dy *= -1 + + if dx < 0: + xstep *= -1 + dx *= -1 + + line_pts = [[x, y]] + + ddx, ddy = 2 * dx, 2 * dy + if ddx > ddy: + errorprev = dx + error = dx + for _ in range(int(dx)): + x += xstep + error += ddy + + if error > ddx: + y += ystep + error -= ddx + if error + errorprev < ddx: + line_pts.append([x, y - ystep]) + elif error + errorprev > ddx: + line_pts.append([x - xstep, y]) + else: + line_pts.append([x - xstep, y]) + line_pts.append([x, y - ystep]) + + line_pts.append([x, y]) + + errorprev = error + else: + errorprev = dx + error = dx + for _ in range(int(dy)): + y += ystep + error += ddx + + if error > ddy: + x += xstep + error -= ddy + if error + errorprev < ddy: + line_pts.append([x - xstep, y]) + elif error + errorprev > ddy: + line_pts.append([x, y - ystep]) + else: + line_pts.append([x - xstep, y]) + line_pts.append([x, y - ystep]) + + line_pts.append([x, y]) + + errorprev = error + + return line_pts + + +@numba.jit(nopython=True) +def draw_fog_of_war_line(top_down_map, fog_of_war_mask, pt1, pt2): + r"""Draws a line on the fog_of_war_mask mask between pt1 and pt2""" + + for pt in bresenham_supercover_line(pt1, pt2): + x, y = pt + + if x < 0 or x >= fog_of_war_mask.shape[0]: + break + + if y < 0 or y >= fog_of_war_mask.shape[1]: + break + + if top_down_map[x, y] == maps.MAP_INVALID_POINT: + break + + fog_of_war_mask[x, y] = 1 + + +@numba.jit(nopython=True) +def _draw_loop( + top_down_map, + fog_of_war_mask, + current_point, + current_angle, + max_line_len, + angles, +): + for angle in angles: + draw_fog_of_war_line( + top_down_map, + fog_of_war_mask, + current_point, + current_point + + max_line_len + * np.array( + [np.cos(current_angle + angle), np.sin(current_angle + angle)] + ), + ) + + +def reveal_fog_of_war( + top_down_map: np.ndarray, + current_fog_of_war_mask: np.ndarray, + current_point: np.ndarray, + current_angle: float, + fov: float = 90, + max_line_len: float = 100, +) -> np.ndarray: + r"""Reveals the fog-of-war at the current location + + This works by simply drawing lines from the agents current location + and stopping once a wall is hit + + Args: + top_down_map: The current top down map. 
Used for respecting walls when revealing
+        current_fog_of_war_mask: The current fog-of-war mask to reveal the
+            fog-of-war on
+        current_point: The current location of the agent on the fog_of_war_mask
+        current_angle: The current look direction of the agent on the
+            fog_of_war_mask
+        fov: The field of view of the agent
+        max_line_len: The maximum length of the lines used to reveal the
+            fog-of-war
+
+    Returns:
+        The updated fog_of_war_mask
+    """
+    fov = np.deg2rad(fov)
+
+    # Set the angle step to a value such that delta_angle * max_line_len = 1
+    angles = np.arange(
+        -fov / 2, fov / 2, step=1.0 / max_line_len, dtype=np.float32
+    )
+
+    fog_of_war_mask = current_fog_of_war_mask.copy()
+    _draw_loop(
+        top_down_map,
+        fog_of_war_mask,
+        current_point,
+        current_angle,
+        max_line_len,
+        angles,
+    )
+
+    return fog_of_war_mask
diff --git a/habitat-lab-dialog/habitat/utils/visualizations/maps.py b/habitat-lab-dialog/habitat/utils/visualizations/maps.py
new file mode 100644
index 0000000..39073d8
--- /dev/null
+++ b/habitat-lab-dialog/habitat/utils/visualizations/maps.py
@@ -0,0 +1,439 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from typing import Any, Dict, List, Optional, Sequence, Tuple
+
+import imageio
+import numpy as np
+import scipy.ndimage
+
+from habitat.core.utils import try_cv2_import
+from habitat.utils.visualizations import utils
+
+try:
+    from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim
+except ImportError:
+    pass
+
+cv2 = try_cv2_import()
+
+
+AGENT_SPRITE = imageio.imread(
+    os.path.join(
+        os.path.dirname(__file__),
+        "assets",
+        "maps_topdown_agent_sprite",
+        "100x100.png",
+    )
+)
+AGENT_SPRITE = np.ascontiguousarray(np.flipud(AGENT_SPRITE))
+
+MAP_INVALID_POINT = 0
+MAP_VALID_POINT = 1
+MAP_BORDER_INDICATOR = 2
+MAP_SOURCE_POINT_INDICATOR = 4
+MAP_TARGET_POINT_INDICATOR = 6
+MAP_SHORTEST_PATH_COLOR = 7
+MAP_VIEW_POINT_INDICATOR = 8
+MAP_TARGET_BOUNDING_BOX = 9
+TOP_DOWN_MAP_COLORS = np.full((256, 3), 150, dtype=np.uint8)
+TOP_DOWN_MAP_COLORS[10:] = cv2.applyColorMap(
+    np.arange(246, dtype=np.uint8), cv2.COLORMAP_JET
+).squeeze(1)[:, ::-1]
+TOP_DOWN_MAP_COLORS[MAP_INVALID_POINT] = [255, 255, 255]  # White
+TOP_DOWN_MAP_COLORS[MAP_VALID_POINT] = [150, 150, 150]  # Light Grey
+TOP_DOWN_MAP_COLORS[MAP_BORDER_INDICATOR] = [50, 50, 50]  # Grey
+TOP_DOWN_MAP_COLORS[MAP_SOURCE_POINT_INDICATOR] = [0, 0, 200]  # Blue
+TOP_DOWN_MAP_COLORS[MAP_TARGET_POINT_INDICATOR] = [200, 0, 0]  # Red
+TOP_DOWN_MAP_COLORS[MAP_SHORTEST_PATH_COLOR] = [0, 200, 0]  # Green
+TOP_DOWN_MAP_COLORS[MAP_VIEW_POINT_INDICATOR] = [245, 150, 150]  # Light Red
+TOP_DOWN_MAP_COLORS[MAP_TARGET_BOUNDING_BOX] = [0, 175, 0]  # Green
+
+
+def draw_agent(
+    image: np.ndarray,
+    agent_center_coord: Tuple[int, int],
+    agent_rotation: float,
+    agent_radius_px: int = 5,
+) -> np.ndarray:
+    r"""Return an image with the agent image composited onto it.
+    Args:
+        image: the image onto which to put the agent.
+        agent_center_coord: the image coordinates where to paste the agent.
+        agent_rotation: the agent's current rotation in radians.
+        agent_radius_px: 1/2 number of pixels the agent will be resized to.
+    Returns:
+        The modified background image. This operation is in place.
+    """
+
+    # Rotate before resize to keep good resolution.
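+    # (scipy.ndimage.interpolation.rotate is a deprecated alias of
+    # scipy.ndimage.rotate in newer SciPy releases.)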
+ rotated_agent = scipy.ndimage.interpolation.rotate( + AGENT_SPRITE, agent_rotation * 180 / np.pi + ) + # Rescale because rotation may result in larger image than original, but + # the agent sprite size should stay the same. + initial_agent_size = AGENT_SPRITE.shape[0] + new_size = rotated_agent.shape[0] + agent_size_px = max( + 1, int(agent_radius_px * 2 * new_size / initial_agent_size) + ) + resized_agent = cv2.resize( + rotated_agent, + (agent_size_px, agent_size_px), + interpolation=cv2.INTER_LINEAR, + ) + utils.paste_overlapping_image(image, resized_agent, agent_center_coord) + return image + + +def pointnav_draw_target_birdseye_view( + agent_position: np.ndarray, + agent_heading: float, + goal_position: np.ndarray, + resolution_px: int = 800, + goal_radius: float = 0.2, + agent_radius_px: int = 20, + target_band_radii: Optional[List[float]] = None, + target_band_colors: Optional[List[Tuple[int, int, int]]] = None, +) -> np.ndarray: + r"""Return an image of agent w.r.t. centered target location for pointnav + tasks. + + Args: + agent_position: the agent's current position. + agent_heading: the agent's current rotation in radians. This can be + found using the HeadingSensor. + goal_position: the pointnav task goal position. + resolution_px: number of pixels for the output image width and height. + goal_radius: how near the agent needs to be to be successful for the + pointnav task. + agent_radius_px: 1/2 number of pixels the agent will be resized to. + target_band_radii: distance in meters to the outer-radius of each band + in the target image. + target_band_colors: colors in RGB 0-255 for the bands in the target. + Returns: + Image centered on the goal with the agent's current relative position + and rotation represented by an arrow. To make the rotations align + visually with habitat, positive-z is up, positive-x is left and a + rotation of 0 points upwards in the output image and rotates clockwise. + """ + if target_band_radii is None: + target_band_radii = [20, 10, 5, 2.5, 1] + if target_band_colors is None: + target_band_colors = [ + (47, 19, 122), + (22, 99, 170), + (92, 177, 0), + (226, 169, 0), + (226, 12, 29), + ] + + assert len(target_band_radii) == len( + target_band_colors + ), "There must be an equal number of scales and colors." + + goal_agent_dist = np.linalg.norm(agent_position - goal_position, 2) + + goal_distance_padding = max( + 2, 2 ** np.ceil(np.log(max(1e-6, goal_agent_dist)) / np.log(2)) + ) + movement_scale = 1.0 / goal_distance_padding + half_res = resolution_px // 2 + im_position = np.full( + (resolution_px, resolution_px, 3), 255, dtype=np.uint8 + ) + + # Draw bands: + for scale, color in zip(target_band_radii, target_band_colors): + if goal_distance_padding * 4 > scale: + cv2.circle( + im_position, + (half_res, half_res), + max(2, int(half_res * scale * movement_scale)), + color, + thickness=-1, + ) + + # Draw such that the agent being inside the radius is the circles + # overlapping. + cv2.circle( + im_position, + (half_res, half_res), + max(2, int(half_res * goal_radius * movement_scale)), + (127, 0, 0), + thickness=-1, + ) + + relative_position = agent_position - goal_position + # swap x and z, remove y for (x,y,z) -> image coordinates. 
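+    # (Habitat world coordinates are (x, y, z) with +y up; the top-down image
+    # is indexed (row, col) = (z, x), hence the [2, 0] reordering below.)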
+ relative_position = relative_position[[2, 0]] + relative_position *= half_res * movement_scale + relative_position += half_res + relative_position = np.round(relative_position).astype(np.int32) + + # Draw the agent + draw_agent(im_position, relative_position, agent_heading, agent_radius_px) + + # Rotate twice to fix coordinate system to upwards being positive-z. + # Rotate instead of flip to keep agent rotations in sync with egocentric + # view. + im_position = np.rot90(im_position, 2) + return im_position + + +def to_grid( + realworld_x: float, + realworld_y: float, + grid_resolution: Tuple[int, int], + sim: Optional["HabitatSim"] = None, + pathfinder=None, +) -> Tuple[int, int]: + r"""Return gridworld index of realworld coordinates assuming top-left corner + is the origin. The real world coordinates of lower left corner are + (coordinate_min, coordinate_min) and of top right corner are + (coordinate_max, coordinate_max) + """ + if sim is None and pathfinder is None: + raise RuntimeError( + "Must provide either a simulator or pathfinder instance" + ) + + if pathfinder is None: + pathfinder = sim.pathfinder + + lower_bound, upper_bound = pathfinder.get_bounds() + + grid_size = ( + abs(upper_bound[2] - lower_bound[2]) / grid_resolution[0], + abs(upper_bound[0] - lower_bound[0]) / grid_resolution[1], + ) + grid_x = int((realworld_x - lower_bound[2]) / grid_size[0]) + grid_y = int((realworld_y - lower_bound[0]) / grid_size[1]) + return grid_x, grid_y + + +def from_grid( + grid_x: int, + grid_y: int, + grid_resolution: Tuple[int, int], + sim: Optional["HabitatSim"] = None, + pathfinder=None, +) -> Tuple[float, float]: + r"""Inverse of _to_grid function. Return real world coordinate from + gridworld assuming top-left corner is the origin. The real world + coordinates of lower left corner are (coordinate_min, coordinate_min) and + of top right corner are (coordinate_max, coordinate_max) + """ + + if sim is None and pathfinder is None: + raise RuntimeError( + "Must provide either a simulator or pathfinder instance" + ) + + if pathfinder is None: + pathfinder = sim.pathfinder + + lower_bound, upper_bound = pathfinder.get_bounds() + + grid_size = ( + abs(upper_bound[2] - lower_bound[2]) / grid_resolution[0], + abs(upper_bound[0] - lower_bound[0]) / grid_resolution[1], + ) + realworld_x = lower_bound[2] + grid_x * grid_size[0] + realworld_y = lower_bound[0] + grid_y * grid_size[1] + return realworld_x, realworld_y + + +def _outline_border(top_down_map): + left_right_block_nav = (top_down_map[:, :-1] == 1) & ( + top_down_map[:, :-1] != top_down_map[:, 1:] + ) + left_right_nav_block = (top_down_map[:, 1:] == 1) & ( + top_down_map[:, :-1] != top_down_map[:, 1:] + ) + + up_down_block_nav = (top_down_map[:-1] == 1) & ( + top_down_map[:-1] != top_down_map[1:] + ) + up_down_nav_block = (top_down_map[1:] == 1) & ( + top_down_map[:-1] != top_down_map[1:] + ) + + top_down_map[:, :-1][left_right_block_nav] = MAP_BORDER_INDICATOR + top_down_map[:, 1:][left_right_nav_block] = MAP_BORDER_INDICATOR + + top_down_map[:-1][up_down_block_nav] = MAP_BORDER_INDICATOR + top_down_map[1:][up_down_nav_block] = MAP_BORDER_INDICATOR + + +def calculate_meters_per_pixel( + map_resolution: int, sim: Optional["HabitatSim"] = None, pathfinder=None +): + r"""Calculate the meters_per_pixel for a given map resolution""" + if sim is None and pathfinder is None: + raise RuntimeError( + "Must provide either a simulator or pathfinder instance" + ) + + if pathfinder is None: + pathfinder = sim.pathfinder + + lower_bound, upper_bound = 
pathfinder.get_bounds()
+    return min(
+        abs(upper_bound[coord] - lower_bound[coord]) / map_resolution
+        for coord in [0, 2]
+    )
+
+
+def get_topdown_map(
+    pathfinder,
+    height: float,
+    map_resolution: int = 1024,
+    draw_border: bool = True,
+    meters_per_pixel: Optional[float] = None,
+) -> np.ndarray:
+    r"""Return a top-down occupancy map for a sim. Note, this only returns valid
+    values for whatever floor the agent is currently on.
+
+    :param pathfinder: A habitat-sim pathfinder instance to get the map from
+    :param height: The height in the environment to make the topdown map
+    :param map_resolution: Length of the longest side of the map. Used to calculate :p:`meters_per_pixel`
+    :param draw_border: Whether or not to draw a border
+    :param meters_per_pixel: Overrides map_resolution and is used directly as
+        the size of a map pixel in meters
+
+    :return: Image containing 0 if occupied, 1 if unoccupied, and 2 if border (if
+        the flag is set).
+    """
+
+    if meters_per_pixel is None:
+        meters_per_pixel = calculate_meters_per_pixel(
+            map_resolution, pathfinder=pathfinder
+        )
+
+    top_down_map = pathfinder.get_topdown_view(
+        meters_per_pixel=meters_per_pixel, height=height
+    ).astype(np.uint8)
+
+    # Draw border if necessary
+    if draw_border:
+        _outline_border(top_down_map)
+
+    return np.ascontiguousarray(top_down_map)
+
+
+def get_topdown_map_from_sim(
+    sim: "HabitatSim",
+    map_resolution: int = 1024,
+    draw_border: bool = True,
+    meters_per_pixel: Optional[float] = None,
+    agent_id: int = 0,
+) -> np.ndarray:
+    r"""Wrapper around :py:`get_topdown_map` that retrieves the pathfinder and height from the current simulator
+
+    :param sim: Simulator instance.
+    :param agent_id: The agent ID
+    """
+    return get_topdown_map(
+        sim.pathfinder,
+        sim.get_agent(agent_id).state.position[1],
+        map_resolution,
+        draw_border,
+        meters_per_pixel,
+    )
+
+
+def colorize_topdown_map(
+    top_down_map: np.ndarray,
+    fog_of_war_mask: Optional[np.ndarray] = None,
+    fog_of_war_desat_amount: float = 0.5,
+) -> np.ndarray:
+    r"""Convert the top down map to RGB based on the indicator values.
+    Args:
+        top_down_map: A non-colored version of the map.
+        fog_of_war_mask: A mask used to determine which parts of the
+            top_down_map are visible.
+            Non-visible parts will be desaturated.
+        fog_of_war_desat_amount: Amount to desaturate the color of unexplored
+            areas. Decreasing this value will make unexplored areas darker.
+            Default: 0.5
+    Returns:
+        A colored version of the top-down map.
+    """
+    _map = TOP_DOWN_MAP_COLORS[top_down_map]
+
+    if fog_of_war_mask is not None:
+        fog_of_war_desat_values = np.array([[fog_of_war_desat_amount], [1.0]])
+        # Only desaturate things that are valid points as only valid points
+        # get revealed
+        desat_mask = top_down_map != MAP_INVALID_POINT
+
+        _map[desat_mask] = (
+            _map * fog_of_war_desat_values[fog_of_war_mask]
+        ).astype(np.uint8)[desat_mask]
+
+    return _map
+
+
+def draw_path(
+    top_down_map: np.ndarray,
+    path_points: Sequence[Tuple],
+    color: int = 10,
+    thickness: int = 2,
+) -> None:
+    r"""Draw path on top_down_map (in place) with specified color.
+    Args:
+        top_down_map: A colored version of the map.
+        color: color code of the path, from TOP_DOWN_MAP_COLORS.
+        path_points: list of points that specify the path to be drawn
+        thickness: thickness of the path.
+ """ + for prev_pt, next_pt in zip(path_points[:-1], path_points[1:]): + # Swapping x y + cv2.line( + top_down_map, + prev_pt[::-1], + next_pt[::-1], + color, + thickness=thickness, + ) + + +def colorize_draw_agent_and_fit_to_height( + topdown_map_info: Dict[str, Any], output_height: int +): + r"""Given the output of the TopDownMap measure, colorizes the map, draws the agent, + and fits to a desired output height + + :param topdown_map_info: The output of the TopDownMap measure + :param output_height: The desired output height + """ + top_down_map = topdown_map_info["map"] + top_down_map = colorize_topdown_map( + top_down_map, topdown_map_info["fog_of_war_mask"] + ) + map_agent_pos = topdown_map_info["agent_map_coord"] + top_down_map = draw_agent( + image=top_down_map, + agent_center_coord=map_agent_pos, + agent_rotation=topdown_map_info["agent_angle"], + agent_radius_px=min(top_down_map.shape[0:2]) // 32, + ) + + if top_down_map.shape[0] > top_down_map.shape[1]: + top_down_map = np.rot90(top_down_map, 1) + + # scale top down map to align with rgb view + old_h, old_w, _ = top_down_map.shape + top_down_height = output_height + top_down_width = int(float(top_down_height) / old_h * old_w) + # cv2 resize (dsize is width first) + top_down_map = cv2.resize( + top_down_map, + (top_down_width, top_down_height), + interpolation=cv2.INTER_CUBIC, + ) + + return top_down_map diff --git a/habitat-lab-dialog/habitat/utils/visualizations/utils.py b/habitat-lab-dialog/habitat/utils/visualizations/utils.py new file mode 100644 index 0000000..a410b7a --- /dev/null +++ b/habitat-lab-dialog/habitat/utils/visualizations/utils.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import textwrap +from typing import Dict, List, Optional, Tuple + +import imageio +import numpy as np +import tqdm + +from habitat.core.logging import logger +from habitat.core.utils import try_cv2_import +from habitat.utils.visualizations import maps + +cv2 = try_cv2_import() + + +def paste_overlapping_image( + background: np.ndarray, + foreground: np.ndarray, + location: Tuple[int, int], + mask: Optional[np.ndarray] = None, +): + r"""Composites the foreground onto the background dealing with edge + boundaries. + Args: + background: the background image to paste on. + foreground: the image to paste. Can be RGB or RGBA. If using alpha + blending, values for foreground and background should both be + between 0 and 255. Otherwise behavior is undefined. + location: the image coordinates to paste the foreground. + mask: If not None, a mask for deciding what part of the foreground to + use. Must be the same size as the foreground if provided. + Returns: + The modified background image. This operation is in place. 
+ """ + assert mask is None or mask.shape[:2] == foreground.shape[:2] + foreground_size = foreground.shape[:2] + min_pad = ( + max(0, foreground_size[0] // 2 - location[0]), + max(0, foreground_size[1] // 2 - location[1]), + ) + + max_pad = ( + max( + 0, + (location[0] + (foreground_size[0] - foreground_size[0] // 2)) + - background.shape[0], + ), + max( + 0, + (location[1] + (foreground_size[1] - foreground_size[1] // 2)) + - background.shape[1], + ), + ) + + background_patch = background[ + (location[0] - foreground_size[0] // 2 + min_pad[0]) : ( + location[0] + + (foreground_size[0] - foreground_size[0] // 2) + - max_pad[0] + ), + (location[1] - foreground_size[1] // 2 + min_pad[1]) : ( + location[1] + + (foreground_size[1] - foreground_size[1] // 2) + - max_pad[1] + ), + ] + foreground = foreground[ + min_pad[0] : foreground.shape[0] - max_pad[0], + min_pad[1] : foreground.shape[1] - max_pad[1], + ] + if foreground.size == 0 or background_patch.size == 0: + # Nothing to do, no overlap. + return background + + if mask is not None: + mask = mask[ + min_pad[0] : foreground.shape[0] - max_pad[0], + min_pad[1] : foreground.shape[1] - max_pad[1], + ] + + if foreground.shape[2] == 4: + # Alpha blending + foreground = ( + background_patch.astype(np.int32) * (255 - foreground[:, :, [3]]) + + foreground[:, :, :3].astype(np.int32) * foreground[:, :, [3]] + ) // 255 + if mask is not None: + background_patch[mask] = foreground[mask] + else: + background_patch[:] = foreground + return background + + +def images_to_video( + images: List[np.ndarray], + output_dir: str, + video_name: str, + fps: int = 10, + quality: Optional[float] = 5, + **kwargs, +): + r"""Calls imageio to run FFMPEG on a list of images. For more info on + parameters, see https://imageio.readthedocs.io/en/stable/format_ffmpeg.html + Args: + images: The list of images. Images should be HxWx3 in RGB order. + output_dir: The folder to put the video in. + video_name: The name for the video. + fps: Frames per second for the video. Not all values work with FFMPEG, + use at your own risk. + quality: Default is 5. Uses variable bit rate. Highest quality is 10, + lowest is 0. Set to None to prevent variable bitrate flags to + FFMPEG so you can manually specify them using output_params + instead. Specifying a fixed bitrate using ‘bitrate’ disables + this parameter. + """ + assert 0 <= quality <= 10 + if not os.path.exists(output_dir): + os.makedirs(output_dir) + video_name = video_name.replace(" ", "_").replace("\n", "_") + ".mp4" + writer = imageio.get_writer( + os.path.join(output_dir, video_name), + fps=fps, + quality=quality, + **kwargs, + ) + logger.info(f"Video created: {os.path.join(output_dir, video_name)}") + for im in tqdm.tqdm(images): + writer.append_data(im) + writer.close() + + +def draw_collision(view: np.ndarray, alpha: float = 0.4) -> np.ndarray: + r"""Draw translucent red strips on the border of input view to indicate + a collision has taken place. + Args: + view: input view of size HxWx3 in RGB order. + alpha: Opacity of red collision strip. 1 is completely non-transparent. + Returns: + A view with collision effect drawn. 
+ """ + strip_width = view.shape[0] // 20 + mask = np.ones(view.shape) + mask[strip_width:-strip_width, strip_width:-strip_width] = 0 + mask = mask == 1 + view[mask] = (alpha * np.array([255, 0, 0]) + (1.0 - alpha) * view)[mask] + return view + + +def observations_to_image(observation: Dict, info: Dict) -> np.ndarray: + r"""Generate image of single frame from observation and info + returned from a single environment step(). + + Args: + observation: observation returned from an environment step(). + info: info returned from an environment step(). + + Returns: + generated image of a single frame. + """ + egocentric_view_l: List[np.ndarray] = [] + if "rgb" in observation: + rgb = observation["rgb"] + if not isinstance(rgb, np.ndarray): + rgb = rgb.cpu().numpy() + + egocentric_view_l.append(rgb) + + # draw depth map if observation has depth info + if "depth" in observation: + depth_map = observation["depth"].squeeze() * 255.0 + if not isinstance(depth_map, np.ndarray): + depth_map = depth_map.cpu().numpy() + + depth_map = depth_map.astype(np.uint8) + depth_map = np.stack([depth_map for _ in range(3)], axis=2) + egocentric_view_l.append(depth_map) + + # add image goal if observation has image_goal info + if "imagegoal" in observation: + rgb = observation["imagegoal"] + if not isinstance(rgb, np.ndarray): + rgb = rgb.cpu().numpy() + + egocentric_view_l.append(rgb) + + assert ( + len(egocentric_view_l) > 0 + ), "Expected at least one visual sensor enabled." + egocentric_view = np.concatenate(egocentric_view_l, axis=1) + + # draw collision + if "collisions" in info and info["collisions"]["is_collision"]: + egocentric_view = draw_collision(egocentric_view) + + frame = egocentric_view + + if "top_down_map" in info: + top_down_map = maps.colorize_draw_agent_and_fit_to_height( + info["top_down_map"], egocentric_view.shape[0] + ) + frame = np.concatenate((egocentric_view, top_down_map), axis=1) + return frame + + +def append_text_to_image(image: np.ndarray, text: str): + r"""Appends text underneath an image of size (height, width, channels). + The returned image has white text on a black background. Uses textwrap to + split long text into multiple lines. + Args: + image: the image to put text underneath + text: a string to display + Returns: + A new image with text inserted underneath the input image + """ + h, w, c = image.shape + font_size = 0.5 + font_thickness = 1 + font = cv2.FONT_HERSHEY_SIMPLEX + blank_image = np.zeros(image.shape, dtype=np.uint8) + + char_size = cv2.getTextSize(" ", font, font_size, font_thickness)[0] + wrapped_text = textwrap.wrap(text, width=int(w / char_size[0])) + + y = 0 + for line in wrapped_text: + textsize = cv2.getTextSize(line, font, font_size, font_thickness)[0] + y += textsize[1] + 10 + x = 10 + cv2.putText( + blank_image, + line, + (x, y), + font, + font_size, + (255, 255, 255), + font_thickness, + lineType=cv2.LINE_AA, + ) + text_image = blank_image[0 : y + 10, 0:w] + final = np.concatenate((image, text_image), axis=0) + return final diff --git a/habitat-lab-dialog/habitat/version.py b/habitat-lab-dialog/habitat/version.py new file mode 100644 index 0000000..70916e2 --- /dev/null +++ b/habitat-lab-dialog/habitat/version.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+
+VERSION = "0.1.7"
diff --git a/habitat-lab-dialog/habitat_baselines/README.md b/habitat-lab-dialog/habitat_baselines/README.md
new file mode 100644
index 0000000..d79192f
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/README.md
@@ -0,0 +1,62 @@
+baselines
+==============================
+### Installation
+
+The `habitat_baselines` sub-package is NOT included upon installation by default. To install `habitat_baselines`, use the following commands instead:
+```bash
+pip install -r requirements.txt
+python setup.py develop --all
+```
+This will also install additional requirements for each sub-module in `habitat_baselines/`, which are specified in `requirements.txt` files located in the sub-module directories.
+
+
+### Reinforcement Learning (RL)
+
+**Proximal Policy Optimization (PPO)**
+
+**paper**: [https://arxiv.org/abs/1707.06347](https://arxiv.org/abs/1707.06347)
+
+**code**: The PPO implementation is based on
+[pytorch-a2c-ppo-acktr](https://github.com/ikostrikov/pytorch-a2c-ppo-acktr).
+
+**dependencies**: A recent version of PyTorch; for installation instructions refer to [pytorch.org](https://pytorch.org/)
+
+For training on sample data please follow the steps in the repository README. You should download the sample [test scene data](http://dl.fbaipublicfiles.com/habitat/habitat-test-scenes.zip), extract it under the main repo (`habitat-lab/`; extraction will create a data folder at `habitat-lab/data`) and run the training command below.
+
+**train**:
+```bash
+python -u habitat_baselines/run.py --exp-config habitat_baselines/config/pointnav/ppo_pointnav_example.yaml --run-type train
+```
+
+**test**:
+```bash
+python -u habitat_baselines/run.py --exp-config habitat_baselines/config/pointnav/ppo_pointnav_example.yaml --run-type eval
+```
+
+We also provide trained RGB, RGBD, and Depth PPO models for MatterPort3D and Gibson.
+To use them, download the pre-trained pytorch models from [this link](https://dl.fbaipublicfiles.com/habitat/data/baselines/v1/habitat_baselines_v2.zip), unzip them, and specify the model path [here](agents/ppo_agents.py#L149).
+
+The `habitat_baselines/config/pointnav/ppo_pointnav.yaml` config has better hyperparameters for large-scale training and loads the [Gibson PointGoal Navigation Dataset](/README.md#task-datasets) instead of the test scenes.
+Change the field `task_config` in `habitat_baselines/config/pointnav/ppo_pointnav.yaml` to `configs/tasks/pointnav_mp3d.yaml` to train on the [MatterPort3D PointGoal Navigation Dataset](/README.md#task-datasets).
+
+### Classic
+
+**SLAM based**
+
+- [Handcrafted agent baseline](slambased/README.md) adapted from the paper
+"Benchmarking Classic and Learned Navigation in Complex 3D Environments".
+
+### Additional Utilities
+
+**Episode iterator options**:
+Coming very soon
+
+**Tensorboard and video generation support**
+
+Enable tensorboard by changing the `tensorboard_dir` field in `habitat_baselines/config/pointnav/ppo_pointnav.yaml`.
+
+Enable video generation for `eval` mode by changing `video_option` to `tensorboard,disk` (for displaying on tensorboard and for saving videos on disk, respectively).
+
+Generated navigation episode recordings should look like this on tensorboard:
+
+<!-- demo GIF: a navigation episode video as rendered in tensorboard (image not preserved in this diff) -->
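+
+For example, in the 0.1.7-era configs these fields are upper-case YAML keys (a
+sketch; check the exact names and casing in your `habitat_baselines` version):
+
+```yaml
+# habitat_baselines/config/pointnav/ppo_pointnav.yaml
+TENSORBOARD_DIR: "tb"
+VIDEO_OPTION: ["tensorboard", "disk"]
+VIDEO_DIR: "video_dir"
+```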
diff --git a/habitat-lab-dialog/habitat_baselines/__init__.py b/habitat-lab-dialog/habitat_baselines/__init__.py new file mode 100644 index 0000000..5157107 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/__init__.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat_baselines.common.base_il_trainer import BaseILTrainer +from habitat_baselines.common.base_trainer import BaseRLTrainer, BaseTrainer +from habitat_baselines.il.trainers.eqa_cnn_pretrain_trainer import ( + EQACNNPretrainTrainer, +) +from habitat_baselines.il.trainers.pacman_trainer import PACMANTrainer +from habitat_baselines.il.trainers.vqa_trainer import VQATrainer +from habitat_baselines.rl.ppo.ppo_trainer import PPOTrainer, RolloutStorage + +__all__ = [ + "BaseTrainer", + "BaseRLTrainer", + "BaseILTrainer", + "PPOTrainer", + "RolloutStorage", + "EQACNNPretrainTrainer", + "PACMANTrainer", + "VQATrainer", +] diff --git a/habitat-lab-dialog/habitat_baselines/agents/__init__.py b/habitat-lab-dialog/habitat_baselines/agents/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/habitat-lab-dialog/habitat_baselines/agents/ppo_agents.py b/habitat-lab-dialog/habitat_baselines/agents/ppo_agents.py new file mode 100644 index 0000000..1d02563 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/agents/ppo_agents.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import argparse +import random +from typing import Dict, Optional + +import numpy as np +import torch +from gym.spaces import Box +from gym.spaces import Dict as SpaceDict +from gym.spaces import Discrete + +import habitat +from habitat.config import Config +from habitat.core.agent import Agent +from habitat.core.simulator import Observations +from habitat_baselines.rl.ddppo.policy import PointNavResNetPolicy +from habitat_baselines.utils.common import batch_obs + + +def get_default_config() -> Config: + c = Config() + c.INPUT_TYPE = "rgb" + c.MODEL_PATH = "data/checkpoints/gibson-rgb-best.pth" + c.RESOLUTION = 256 + c.HIDDEN_SIZE = 512 + c.RANDOM_SEED = 7 + c.PTH_GPU_ID = 0 + c.GOAL_SENSOR_UUID = "pointgoal_with_gps_compass" + return c + + +class PPOAgent(Agent): + def __init__(self, config: Config) -> None: + spaces = { + get_default_config().GOAL_SENSOR_UUID: Box( + low=np.finfo(np.float32).min, + high=np.finfo(np.float32).max, + shape=(2,), + dtype=np.float32, + ) + } + + if config.INPUT_TYPE in ["depth", "rgbd"]: + spaces["depth"] = Box( + low=0, + high=1, + shape=(config.RESOLUTION, config.RESOLUTION, 1), + dtype=np.float32, + ) + + if config.INPUT_TYPE in ["rgb", "rgbd"]: + spaces["rgb"] = Box( + low=0, + high=255, + shape=(config.RESOLUTION, config.RESOLUTION, 3), + dtype=np.uint8, + ) + observation_spaces = SpaceDict(spaces) + + action_spaces = Discrete(4) + + self.device = ( + torch.device("cuda:{}".format(config.PTH_GPU_ID)) + if torch.cuda.is_available() + else torch.device("cpu") + ) + self.hidden_size = config.HIDDEN_SIZE + + random.seed(config.RANDOM_SEED) + torch.random.manual_seed(config.RANDOM_SEED) + if torch.cuda.is_available(): + torch.backends.cudnn.deterministic = True # type: ignore + + self.actor_critic = PointNavResNetPolicy( + observation_space=observation_spaces, + 
action_space=action_spaces, + hidden_size=self.hidden_size, + normalize_visual_inputs="rgb" in spaces, + ) + self.actor_critic.to(self.device) + + if config.MODEL_PATH: + ckpt = torch.load(config.MODEL_PATH, map_location=self.device) + # Filter only actor_critic weights + self.actor_critic.load_state_dict( + { + k[len("actor_critic.") :]: v + for k, v in ckpt["state_dict"].items() + if "actor_critic" in k + } + ) + + else: + habitat.logger.error( + "Model checkpoint wasn't loaded, evaluating " "a random model." + ) + + self.test_recurrent_hidden_states: Optional[torch.Tensor] = None + self.not_done_masks: Optional[torch.Tensor] = None + self.prev_actions: Optional[torch.Tensor] = None + + def reset(self) -> None: + self.test_recurrent_hidden_states = torch.zeros( + 1, + self.actor_critic.net.num_recurrent_layers, + self.hidden_size, + device=self.device, + ) + self.not_done_masks = torch.zeros( + 1, 1, device=self.device, dtype=torch.bool + ) + self.prev_actions = torch.zeros( + 1, 1, dtype=torch.long, device=self.device + ) + + def act(self, observations: Observations) -> Dict[str, int]: + batch = batch_obs([observations], device=self.device) + with torch.no_grad(): + ( + _, + actions, + _, + self.test_recurrent_hidden_states, + ) = self.actor_critic.act( + batch, + self.test_recurrent_hidden_states, + self.prev_actions, + self.not_done_masks, + deterministic=False, + ) + # Make masks not done till reset (end of episode) will be called + self.not_done_masks.fill_(True) + self.prev_actions.copy_(actions) # type: ignore + + return {"action": actions[0][0].item()} + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--input-type", + default="rgb", + choices=["blind", "rgb", "depth", "rgbd"], + ) + parser.add_argument("--model-path", type=str, default=None) + parser.add_argument( + "--task-config", type=str, default="configs/tasks/pointnav.yaml" + ) + args = parser.parse_args() + + agent_config = get_default_config() + agent_config.INPUT_TYPE = args.input_type + if args.model_path is not None: + agent_config.MODEL_PATH = args.model_path + + agent = PPOAgent(agent_config) + benchmark = habitat.Benchmark(config_paths=args.task_config) + metrics = benchmark.evaluate(agent) + + for k, v in metrics.items(): + habitat.logger.info("{}: {:.3f}".format(k, v)) + + +if __name__ == "__main__": + main() diff --git a/habitat-lab-dialog/habitat_baselines/agents/simple_agents.py b/habitat-lab-dialog/habitat_baselines/agents/simple_agents.py new file mode 100644 index 0000000..a0f28e2 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/agents/simple_agents.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
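+
+# Usage sketch: benchmark one of the agents below from the habitat-lab root
+# (paths follow the standard habitat-lab data setup; flags are defined in
+# main() at the bottom of this file):
+#
+#     python -u habitat_baselines/agents/simple_agents.py \
+#         --task-config configs/tasks/pointnav.yaml \
+#         --agent-class GoalFollower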
+ + +import argparse +from math import pi +from typing import Dict, Union + +import numpy as np +from numpy import bool_, int64, ndarray + +import habitat +from habitat.config.default import get_config +from habitat.core.simulator import Observations +from habitat.sims.habitat_simulator.actions import HabitatSimActions + + +class RandomAgent(habitat.Agent): + def __init__(self, success_distance: float, goal_sensor_uuid: str) -> None: + self.dist_threshold_to_stop = success_distance + self.goal_sensor_uuid = goal_sensor_uuid + + def reset(self) -> None: + pass + + def is_goal_reached(self, observations: Observations) -> bool_: + dist = observations[self.goal_sensor_uuid][0] + return dist <= self.dist_threshold_to_stop + + def act(self, observations: Observations) -> Dict[str, int64]: + if self.is_goal_reached(observations): + action = HabitatSimActions.STOP + else: + action = np.random.choice( + [ + HabitatSimActions.MOVE_FORWARD, + HabitatSimActions.TURN_LEFT, + HabitatSimActions.TURN_RIGHT, + ] + ) + return {"action": action} + + +class ForwardOnlyAgent(RandomAgent): + def act(self, observations: Observations) -> Dict[str, int]: + if self.is_goal_reached(observations): + action = HabitatSimActions.STOP + else: + action = HabitatSimActions.MOVE_FORWARD + return {"action": action} + + +class RandomForwardAgent(RandomAgent): + def __init__(self, success_distance: float, goal_sensor_uuid: str) -> None: + super().__init__(success_distance, goal_sensor_uuid) + self.FORWARD_PROBABILITY = 0.8 + + def act(self, observations: Observations) -> Dict[str, Union[int, int64]]: + if self.is_goal_reached(observations): + action = HabitatSimActions.STOP + else: + if np.random.uniform(0, 1, 1) < self.FORWARD_PROBABILITY: + action = HabitatSimActions.MOVE_FORWARD + else: + action = np.random.choice( + [HabitatSimActions.TURN_LEFT, HabitatSimActions.TURN_RIGHT] + ) + + return {"action": action} + + +class GoalFollower(RandomAgent): + def __init__(self, success_distance: float, goal_sensor_uuid: str) -> None: + super().__init__(success_distance, goal_sensor_uuid) + self.pos_th = self.dist_threshold_to_stop + self.angle_th = float(np.deg2rad(15)) + self.random_prob = 0 + + def normalize_angle(self, angle: ndarray) -> ndarray: + if angle < -pi: + angle = 2.0 * pi + angle + if angle > pi: + angle = -2.0 * pi + angle + return angle + + def turn_towards_goal(self, angle_to_goal: ndarray) -> int: + if angle_to_goal > pi or ( + (angle_to_goal < 0) and (angle_to_goal > -pi) + ): + action = HabitatSimActions.TURN_RIGHT + else: + action = HabitatSimActions.TURN_LEFT + return action + + def act(self, observations: Observations) -> Dict[str, int]: + if self.is_goal_reached(observations): + action = HabitatSimActions.STOP + else: + angle_to_goal = self.normalize_angle( + np.array(observations[self.goal_sensor_uuid][1]) + ) + if abs(angle_to_goal) < self.angle_th: + action = HabitatSimActions.MOVE_FORWARD + else: + action = self.turn_towards_goal(angle_to_goal) + + return {"action": action} + + +def get_all_subclasses(cls): + return set(cls.__subclasses__()).union( + [s for c in cls.__subclasses__() for s in get_all_subclasses(c)] + ) + + +def get_agent_cls(agent_class_name): + sub_classes = [ + sub_class + for sub_class in get_all_subclasses(habitat.Agent) + if sub_class.__name__ == agent_class_name + ] + return sub_classes[0] + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--success-distance", type=float, default=0.2) + parser.add_argument( + "--task-config", type=str, 
default="configs/tasks/pointnav.yaml" + ) + parser.add_argument("--agent-class", type=str, default="GoalFollower") + args = parser.parse_args() + + config = get_config(args.task_config) + + agent = get_agent_cls(args.agent_class)( + success_distance=args.success_distance, + goal_sensor_uuid=config.TASK.GOAL_SENSOR_UUID, + ) + benchmark = habitat.Benchmark(config_paths=args.task_config) + metrics = benchmark.evaluate(agent) + + for k, v in metrics.items(): + habitat.logger.info("{}: {:.3f}".format(k, v)) + + +if __name__ == "__main__": + main() diff --git a/habitat-lab-dialog/habitat_baselines/agents/slam_agents.py b/habitat-lab-dialog/habitat_baselines/agents/slam_agents.py new file mode 100644 index 0000000..5c75faf --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/agents/slam_agents.py @@ -0,0 +1,634 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +# type: ignore + +import argparse +import os +import random +import sys +import time +from math import pi + +import numpy as np +import orbslam2 +import PIL +import requests +import torch +from torch.nn import functional as F + +import habitat +from habitat.config.default import get_config +from habitat.sims.habitat_simulator.actions import HabitatSimActions +from habitat_baselines.config.default import get_config as cfg_baseline +from habitat_baselines.slambased.mappers import DirectDepthMapper +from habitat_baselines.slambased.monodepth import MonoDepthEstimator +from habitat_baselines.slambased.path_planners import DifferentiableStarPlanner +from habitat_baselines.slambased.reprojection import ( + angle_to_pi_2_minus_pi_2 as norm_ang, +) +from habitat_baselines.slambased.reprojection import ( + get_direction, + get_distance, + habitat_goalpos_to_mapgoal_pos, + homogenize_p, + planned_path2tps, + project_tps_into_worldmap, +) +from habitat_baselines.slambased.utils import generate_2dgrid + +GOAL_SENSOR_UUID = "pointgoal_with_gps_compass" + + +def download(url, filename): + with open(filename, "wb") as f: + response = requests.get(url, stream=True) + total = response.headers.get("content-length") + if total is None: + f.write(response.content) + else: + downloaded = 0 + total = int(total) + for data in response.iter_content( + chunk_size=max(int(total / 1000), 1024 * 1024) + ): + downloaded += len(data) + f.write(data) + done = int(50 * downloaded / total) + sys.stdout.write( + "\r[{}{}]".format("█" * done, "." 
* (50 - done)) + ) + sys.stdout.flush() + sys.stdout.write("\n") + + +def ResizePIL2(np_img, size=256): + im1 = PIL.Image.fromarray(np_img) + return np.array(im1.resize((size, size))) + + +def make_good_config_for_orbslam2(config): + config.SIMULATOR.AGENT_0.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR"] + config.SIMULATOR.RGB_SENSOR.WIDTH = 256 + config.SIMULATOR.RGB_SENSOR.HEIGHT = 256 + config.SIMULATOR.DEPTH_SENSOR.WIDTH = 256 + config.SIMULATOR.DEPTH_SENSOR.HEIGHT = 256 + config.TRAINER.ORBSLAM2.CAMERA_HEIGHT = ( + config.SIMULATOR.DEPTH_SENSOR.POSITION[1] + ) + config.TRAINER.ORBSLAM2.H_OBSTACLE_MIN = ( + 0.3 * config.TRAINER.ORBSLAM2.CAMERA_HEIGHT + ) + config.TRAINER.ORBSLAM2.H_OBSTACLE_MAX = ( + 1.0 * config.TRAINER.ORBSLAM2.CAMERA_HEIGHT + ) + config.TRAINER.ORBSLAM2.MIN_PTS_IN_OBSTACLE = ( + config.SIMULATOR.DEPTH_SENSOR.WIDTH / 2.0 + ) + return + + +class RandomAgent(object): + r"""Simplest agent, which returns random actions, + until reach the goal + """ + + def __init__(self, config): + super(RandomAgent, self).__init__() + self.num_actions = config.NUM_ACTIONS + self.dist_threshold_to_stop = config.DIST_TO_STOP + self.reset() + return + + def reset(self): + self.steps = 0 + return + + def update_internal_state(self, habitat_observation): + self.obs = habitat_observation + self.steps += 1 + return + + def is_goal_reached(self): + dist = self.obs[GOAL_SENSOR_UUID][0] + return dist <= self.dist_threshold_to_stop + + def act(self, habitat_observation=None, random_prob=1.0): + self.update_internal_state(habitat_observation) + # Act + # Check if we are done + if self.is_goal_reached(): + action = HabitatSimActions.STOP + else: + action = random.randint(0, self.num_actions - 1) + return {"action": action} + + +class BlindAgent(RandomAgent): + def __init__(self, config): + super(BlindAgent, self).__init__(config) + self.pos_th = config.DIST_TO_STOP + self.angle_th = config.ANGLE_TH + self.reset() + return + + def decide_what_to_do(self): + distance_to_goal = self.obs[GOAL_SENSOR_UUID][0] + angle_to_goal = norm_ang(np.array(self.obs[GOAL_SENSOR_UUID][1])) + command = HabitatSimActions.STOP + if distance_to_goal <= self.pos_th: + return command + if abs(angle_to_goal) < self.angle_th: + command = HabitatSimActions.MOVE_FORWARD + else: + if (angle_to_goal > 0) and (angle_to_goal < pi): + command = HabitatSimActions.TURN_LEFT + elif angle_to_goal > pi: + command = HabitatSimActions.TURN_RIGHT + elif (angle_to_goal < 0) and (angle_to_goal > -pi): + command = HabitatSimActions.TURN_RIGHT + else: + command = HabitatSimActions.TURN_LEFT + + return command + + def act(self, habitat_observation=None, random_prob=0.1): + self.update_internal_state(habitat_observation) + # Act + if self.is_goal_reached(): + return HabitatSimActions.STOP + command = self.decide_what_to_do() + random_action = random.randint(0, self.num_actions - 1) + act_randomly = np.random.uniform(0, 1, 1) < random_prob + if act_randomly: + action = random_action + else: + action = command + return {"action": action} + + +class ORBSLAM2Agent(RandomAgent): + def __init__(self, config, device=torch.device("cuda:0")): # noqa: B008 + super(ORBSLAM2Agent, self).__init__(config) + self.num_actions = config.NUM_ACTIONS + self.dist_threshold_to_stop = config.DIST_TO_STOP + self.slam_vocab_path = config.SLAM_VOCAB_PATH + assert os.path.isfile(self.slam_vocab_path) + self.slam_settings_path = config.SLAM_SETTINGS_PATH + assert os.path.isfile(self.slam_settings_path) + self.slam = orbslam2.System( + self.slam_vocab_path, 
self.slam_settings_path, orbslam2.Sensor.RGBD + ) + self.slam.set_use_viewer(False) + self.slam.initialize() + self.device = device + self.map_size_meters = config.MAP_SIZE + self.map_cell_size = config.MAP_CELL_SIZE + self.pos_th = config.DIST_REACHED_TH + self.next_wp_th = config.NEXT_WAYPOINT_TH + self.angle_th = config.ANGLE_TH + self.obstacle_th = config.MIN_PTS_IN_OBSTACLE + self.depth_denorm = config.DEPTH_DENORM + self.planned_waypoints = [] + self.mapper = DirectDepthMapper( + camera_height=config.CAMERA_HEIGHT, + near_th=config.D_OBSTACLE_MIN, + far_th=config.D_OBSTACLE_MAX, + h_min=config.H_OBSTACLE_MIN, + h_max=config.H_OBSTACLE_MAX, + map_size=config.MAP_SIZE, + map_cell_size=config.MAP_CELL_SIZE, + device=device, + ) + self.planner = DifferentiableStarPlanner( + max_steps=config.PLANNER_MAX_STEPS, + preprocess=config.PREPROCESS_MAP, + beta=config.BETA, + device=device, + ) + self.slam_to_world = 1.0 + self.timestep = 0.1 + self.timing = False + self.reset() + return + + def reset(self): + super(ORBSLAM2Agent, self).reset() + self.offset_to_goal = None + self.tracking_is_OK = False + self.waypointPose6D = None + self.unseen_obstacle = False + self.action_history = [] + self.planned_waypoints = [] + self.map2DObstacles = self.init_map2d() + n, ch, height, width = self.map2DObstacles.size() + self.coordinatesGrid = generate_2dgrid(height, width, False).to( + self.device + ) + self.pose6D = self.init_pose6d() + self.action_history = [] + self.pose6D_history = [] + self.position_history = [] + self.planned2Dpath = torch.zeros((0)) + self.slam.reset() + self.cur_time = 0 + self.toDoList = [] + self.waypoint_id = 0 + if self.device != torch.device("cpu"): + torch.cuda.empty_cache() + return + + def update_internal_state(self, habitat_observation): + super(ORBSLAM2Agent, self).update_internal_state(habitat_observation) + self.cur_time += self.timestep + rgb, depth = self.rgb_d_from_observation(habitat_observation) + t = time.time() + try: + self.slam.process_image_rgbd(rgb, depth, self.cur_time) + if self.timing: + print(time.time() - t, "ORB_SLAM2") + self.tracking_is_OK = str(self.slam.get_tracking_state()) == "OK" + except BaseException: + print("Warning!!!! 
ORBSLAM processing frame error") + self.tracking_is_OK = False + if not self.tracking_is_OK: + self.reset() + t = time.time() + self.set_offset_to_goal(habitat_observation) + if self.tracking_is_OK: + trajectory_history = np.array(self.slam.get_trajectory_points()) + self.pose6D = homogenize_p( + torch.from_numpy(trajectory_history[-1])[1:] + .view(3, 4) + .to(self.device) + ).view(1, 4, 4) + self.trajectory_history = trajectory_history + if len(self.position_history) > 1: + previous_step = get_distance( + self.pose6D.view(4, 4), + torch.from_numpy(self.position_history[-1]) + .view(4, 4) + .to(self.device), + ) + if self.action_history[-1] == HabitatSimActions.MOVE_FORWARD: + self.unseen_obstacle = ( + previous_step.item() <= 0.001 + ) # hardcoded threshold for not moving + current_obstacles = self.mapper( + torch.from_numpy(depth).to(self.device).squeeze(), self.pose6D + ).to(self.device) + self.current_obstacles = current_obstacles + self.map2DObstacles = torch.max( + self.map2DObstacles, current_obstacles.unsqueeze(0).unsqueeze(0) + ) + if self.timing: + print(time.time() - t, "Mapping") + return True + + def init_pose6d(self): + return torch.eye(4).float().to(self.device) + + def map_size_in_cells(self): + return int(self.map_size_meters / self.map_cell_size) + + def init_map2d(self): + return ( + torch.zeros( + 1, 1, self.map_size_in_cells(), self.map_size_in_cells() + ) + .float() + .to(self.device) + ) + + def get_orientation_on_map(self): + self.pose6D = self.pose6D.view(1, 4, 4) + return torch.tensor( + [ + [self.pose6D[0, 0, 0], self.pose6D[0, 0, 2]], + [self.pose6D[0, 2, 0], self.pose6D[0, 2, 2]], + ] + ) + + def get_position_on_map(self, do_floor=True): + return project_tps_into_worldmap( + self.pose6D.view(1, 4, 4), + self.map_cell_size, + self.map_size_meters, + do_floor, + ) + + def act(self, habitat_observation, random_prob=0.1): + # Update internal state + t = time.time() + cc = 0 + update_is_ok = self.update_internal_state(habitat_observation) + while not update_is_ok: + update_is_ok = self.update_internal_state(habitat_observation) + cc += 1 + if cc > 2: + break + if self.timing: + print(time.time() - t, " s, update internal state") + self.position_history.append( + self.pose6D.detach().cpu().numpy().reshape(1, 4, 4) + ) + success = self.is_goal_reached() + if success: + action = HabitatSimActions.STOP + self.action_history.append(action) + return {"action": action} + # Plan action + t = time.time() + self.planned2Dpath, self.planned_waypoints = self.plan_path() + if self.timing: + print(time.time() - t, " s, Planning") + t = time.time() + # Act + if self.waypointPose6D is None: + self.waypointPose6D = self.get_valid_waypoint_pose6d() + if ( + self.is_waypoint_reached(self.waypointPose6D) + or not self.tracking_is_OK + ): + self.waypointPose6D = self.get_valid_waypoint_pose6d() + self.waypoint_id += 1 + action = self.decide_what_to_do() + # May be random? 
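+        # With probability `random_prob`, replace the planned command with a
+        # uniformly sampled action; this randomization helps the agent escape
+        # states where SLAM tracking or the planner is stuck.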
+ random_action = random.randint(0, self.num_actions - 1) + what_to_do = np.random.uniform(0, 1, 1) + if what_to_do < random_prob: + action = random_action + if self.timing: + print(time.time() - t, " s, get action") + self.action_history.append(action) + return {"action": action} + + def is_waypoint_good(self, pose6d): + p_init = self.pose6D.squeeze() + dist_diff = get_distance(p_init, pose6d) + valid = dist_diff > self.next_wp_th + return valid.item() + + def is_waypoint_reached(self, pose6d): + p_init = self.pose6D.squeeze() + dist_diff = get_distance(p_init, pose6d) + reached = dist_diff <= self.pos_th + return reached.item() + + def get_waypoint_dist_dir(self): + angle = get_direction( + self.pose6D.squeeze(), self.waypointPose6D.squeeze(), 0, 0 + ) + dist = get_distance( + self.pose6D.squeeze(), self.waypointPose6D.squeeze() + ) + return torch.cat( + [ + dist.view(1, 1), + torch.sin(angle).view(1, 1), + torch.cos(angle).view(1, 1), + ], + dim=1, + ) + + def get_valid_waypoint_pose6d(self): + p_next = self.planned_waypoints[0] + while not self.is_waypoint_good(p_next): + if len(self.planned_waypoints) > 1: + self.planned_waypoints = self.planned_waypoints[1:] + p_next = self.planned_waypoints[0] + else: + p_next = self.estimatedGoalPos6D.squeeze() + break + return p_next + + def set_offset_to_goal(self, observation): + self.offset_to_goal = ( + torch.from_numpy(observation[GOAL_SENSOR_UUID]) + .float() + .to(self.device) + ) + self.estimatedGoalPos2D = habitat_goalpos_to_mapgoal_pos( + self.offset_to_goal, + self.pose6D.squeeze(), + self.map_cell_size, + self.map_size_meters, + ) + self.estimatedGoalPos6D = planned_path2tps( + [self.estimatedGoalPos2D], + self.map_cell_size, + self.map_size_meters, + 1.0, + ).to(self.device)[0] + return + + def rgb_d_from_observation(self, habitat_observation): + rgb = habitat_observation["rgb"] + depth = None + if "depth" in habitat_observation: + depth = self.depth_denorm * habitat_observation["depth"] + return rgb, depth + + def prev_plan_is_not_valid(self): + if len(self.planned2Dpath) == 0: + return True + pp = torch.cat(self.planned2Dpath).detach().cpu().view(-1, 2) + binary_map = self.map2DObstacles.squeeze().detach() >= self.obstacle_th + obstacles_on_path = ( + binary_map[pp[:, 0].long(), pp[:, 1].long()] + ).long().sum().item() > 0 + return obstacles_on_path # obstacles_nearby or obstacles_on_path + + def rawmap2_planner_ready(self, rawmap, start_map, goal_map): + map1 = (rawmap / float(self.obstacle_th)) ** 2 + map1 = ( + torch.clamp(map1, min=0, max=1.0) + - start_map + - F.max_pool2d(goal_map, 3, stride=1, padding=1) + ) + return torch.relu(map1) + + def plan_path(self, overwrite=False): + t = time.time() + if ( + (not self.prev_plan_is_not_valid()) + and (not overwrite) + and (len(self.planned_waypoints) > 0) + ): + return self.planned2Dpath, self.planned_waypoints + self.waypointPose6D = None + current_pos = self.get_position_on_map() + start_map = torch.zeros_like(self.map2DObstacles).to(self.device) + start_map[ + 0, 0, current_pos[0, 0].long(), current_pos[0, 1].long() + ] = 1.0 + goal_map = torch.zeros_like(self.map2DObstacles).to(self.device) + goal_map[ + 0, + 0, + self.estimatedGoalPos2D[0, 0].long(), + self.estimatedGoalPos2D[0, 1].long(), + ] = 1.0 + path, cost = self.planner( + self.rawmap2_planner_ready( + self.map2DObstacles, start_map, goal_map + ).to(self.device), + self.coordinatesGrid.to(self.device), + goal_map.to(self.device), + start_map.to(self.device), + ) + if len(path) == 0: + return path, [] + if self.timing: + 
print(time.time() - t, " s, Planning") + t = time.time() + planned_waypoints = planned_path2tps( + path, self.map_cell_size, self.map_size_meters, 1.0, False + ).to(self.device) + return path, planned_waypoints + + def planner_prediction_to_command(self, p_next): + command = HabitatSimActions.STOP + p_init = self.pose6D.squeeze() + d_angle_rot_th = self.angle_th + pos_th = self.pos_th + if get_distance(p_init, p_next) <= pos_th: + return command + d_angle = norm_ang( + get_direction(p_init, p_next, ang_th=d_angle_rot_th, pos_th=pos_th) + ) + if abs(d_angle) < d_angle_rot_th: + command = HabitatSimActions.MOVE_FORWARD + else: + if (d_angle > 0) and (d_angle < pi): + command = HabitatSimActions.TURN_LEFT + elif d_angle > pi: + command = HabitatSimActions.TURN_RIGHT + elif (d_angle < 0) and (d_angle > -pi): + command = HabitatSimActions.TURN_RIGHT + else: + command = HabitatSimActions.TURN_LEFT + return command + + def decide_what_to_do(self): + action = None + if self.is_goal_reached(): + action = HabitatSimActions.STOP + return {"action": action} + if self.unseen_obstacle: + command = HabitatSimActions.TURN_RIGHT + return command + command = HabitatSimActions.STOP + command = self.planner_prediction_to_command(self.waypointPose6D) + return command + + +class ORBSLAM2MonodepthAgent(ORBSLAM2Agent): + def __init__( + self, + config, + device=torch.device("cuda:0"), # noqa: B008 + monocheckpoint="habitat_baselines/slambased/data/mp3d_resnet50.pth", + ): + super(ORBSLAM2MonodepthAgent, self).__init__(config) + self.num_actions = config.NUM_ACTIONS + self.dist_threshold_to_stop = config.DIST_TO_STOP + self.slam_vocab_path = config.SLAM_VOCAB_PATH + assert os.path.isfile(self.slam_vocab_path) + self.slam_settings_path = config.SLAM_SETTINGS_PATH + assert os.path.isfile(self.slam_settings_path) + self.slam = orbslam2.System( + self.slam_vocab_path, self.slam_settings_path, orbslam2.Sensor.RGBD + ) + self.slam.set_use_viewer(False) + self.slam.initialize() + self.device = device + self.map_size_meters = config.MAP_SIZE + self.map_cell_size = config.MAP_CELL_SIZE + self.pos_th = config.DIST_REACHED_TH + self.next_wp_th = config.NEXT_WAYPOINT_TH + self.angle_th = config.ANGLE_TH + self.obstacle_th = config.MIN_PTS_IN_OBSTACLE + self.depth_denorm = config.DEPTH_DENORM + self.planned_waypoints = [] + self.mapper = DirectDepthMapper( + camera_height=config.CAMERA_HEIGHT, + near_th=config.D_OBSTACLE_MIN, + far_th=config.D_OBSTACLE_MAX, + h_min=config.H_OBSTACLE_MIN, + h_max=config.H_OBSTACLE_MAX, + map_size=config.MAP_SIZE, + map_cell_size=config.MAP_CELL_SIZE, + device=device, + ) + self.planner = DifferentiableStarPlanner( + max_steps=config.PLANNER_MAX_STEPS, + preprocess=config.PREPROCESS_MAP, + beta=config.BETA, + device=device, + ) + self.slam_to_world = 1.0 + self.timestep = 0.1 + self.timing = False + self.checkpoint = monocheckpoint + if not os.path.isfile(self.checkpoint): + mp3d_url = "http://cmp.felk.cvut.cz/~mishkdmy/navigation/mp3d_ft_monodepth_resnet50.pth" + # suncg_me_url = "http://cmp.felk.cvut.cz/~mishkdmy/navigation/suncg_me_resnet.pth" + # suncg_mf_url = "http://cmp.felk.cvut.cz/~mishkdmy/navigation/suncg_mf_resnet.pth" + url = mp3d_url + print("No monodepth checkpoint found. 
Downloading...", url) + download(url, self.checkpoint) + self.monodepth = MonoDepthEstimator(self.checkpoint) + self.reset() + return + + def rgb_d_from_observation(self, habitat_observation): + rgb = habitat_observation["rgb"] + depth = ResizePIL2( + self.monodepth.compute_depth( + PIL.Image.fromarray(rgb).resize((320, 320)) + ), + 256, + ) # /1.75 + depth[depth > 3.0] = 0 + depth[depth < 0.1] = 0 + return rgb, np.array(depth).astype(np.float32) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--agent-type", + default="orbslam2-rgbd", + choices=["blind", "orbslam2-rgbd", "orbslam2-rgb-monod"], + ) + parser.add_argument( + "--task-config", type=str, default="tasks/pointnav_rgbd.yaml" + ) + args = parser.parse_args() + + config = get_config() + agent_config = cfg_baseline() + config.defrost() + config.BASELINE = agent_config.BASELINE + make_good_config_for_orbslam2(config) + + if args.agent_type == "blind": + agent = BlindAgent(config.TRAINER.ORBSLAM2) + elif args.agent_type == "orbslam2-rgbd": + agent = ORBSLAM2Agent(config.TRAINER.ORBSLAM2) + elif args.agent_type == "orbslam2-rgb-monod": + agent = ORBSLAM2MonodepthAgent(config.TRAINER.ORBSLAM2) + else: + raise ValueError(args.agent_type, "is unknown type of agent") + benchmark = habitat.Benchmark(args.task_config) + metrics = benchmark.evaluate(agent) + for k, v in metrics.items(): + habitat.logger.info("{}: {:.3f}".format(k, v)) + + +if __name__ == "__main__": + main() diff --git a/habitat-lab-dialog/habitat_baselines/common/base_il_trainer.py b/habitat-lab-dialog/habitat_baselines/common/base_il_trainer.py new file mode 100644 index 0000000..5f60263 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/common/base_il_trainer.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +import os +from collections import OrderedDict +from typing import Dict, List + +import torch + +from habitat import Config +from habitat_baselines.common.base_trainer import BaseTrainer +from habitat_baselines.common.tensorboard_utils import TensorboardWriter + + +class BaseILTrainer(BaseTrainer): + r"""Base trainer class for IL trainers. Future RL-specific + methods should be hosted here. 
+ """ + device: torch.device + config: Config + video_option: List[str] + _flush_secs: int + + def __init__(self, config: Config): + super().__init__() + assert config is not None, "needs config file to initialize trainer" + self.config = config + self._flush_secs = 30 + self._make_dirs() + + @property + def flush_secs(self): + return self._flush_secs + + @flush_secs.setter + def flush_secs(self, value: int): + self._flush_secs = value + + def _make_dirs(self) -> None: + r"""Makes directories for log files, checkpoints & results.""" + self._make_log_dir() + self._make_ckpt_dir() + if self.config.EVAL_SAVE_RESULTS: + self._make_results_dir() + + def _make_log_dir(self) -> None: + r"""Makes directory for writing log files.""" + if self.config.LOG_METRICS and not os.path.isdir( + self.config.OUTPUT_LOG_DIR + ): + os.makedirs(self.config.OUTPUT_LOG_DIR) + + def _make_ckpt_dir(self) -> None: + r"""Makes directory for saving model checkpoints.""" + if not os.path.isdir(self.config.CHECKPOINT_FOLDER): + os.makedirs(self.config.CHECKPOINT_FOLDER) + + def _make_results_dir(self) -> None: + r"""Makes directory for saving eval results.""" + dir_name = self.config.RESULTS_DIR.format(split="val") + os.makedirs(dir_name, exist_ok=True) + + def train(self) -> None: + raise NotImplementedError + + def _eval_checkpoint( + self, + checkpoint_path: str, + writer: TensorboardWriter, + checkpoint_index: int = 0, + ) -> None: + r"""Evaluates a single checkpoint. Trainer algorithms should + implement this. + + Args: + checkpoint_path: path of checkpoint + writer: tensorboard writer object for logging to tensorboard + checkpoint_index: index of cur checkpoint for logging + + Returns: + None + """ + raise NotImplementedError + + def save_checkpoint(self, state_dict: OrderedDict, file_name: str) -> None: + r"""Save checkpoint with specified name. + + Args: + state_dict: model's state_dict + file_name: file name for checkpoint + + Returns: + None + """ + torch.save( + state_dict, os.path.join(self.config.CHECKPOINT_FOLDER, file_name) + ) + + def load_checkpoint(self, checkpoint_path, *args, **kwargs) -> Dict: + raise NotImplementedError diff --git a/habitat-lab-dialog/habitat_baselines/common/base_trainer.py b/habitat-lab-dialog/habitat_baselines/common/base_trainer.py new file mode 100644 index 0000000..4fe5a0f --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/common/base_trainer.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os +import time +from typing import Any, ClassVar, Dict, List, Tuple, Union + +import torch +from numpy import ndarray +from torch import Tensor + +from habitat import Config, logger +from habitat.core.env import Env, RLEnv +from habitat.core.vector_env import VectorEnv +from habitat_baselines.common.tensorboard_utils import TensorboardWriter +from habitat_baselines.utils.common import ( + get_checkpoint_id, + poll_checkpoint_folder, +) + + +class BaseTrainer: + r"""Generic trainer class that serves as a base template for more + specific trainer classes like RL trainer, SLAM or imitation learner. + Includes only the most basic functionality. + """ + + supported_tasks: ClassVar[List[str]] + + def train(self) -> None: + raise NotImplementedError + + def _setup_eval_config(self, checkpoint_config: Config) -> Config: + r"""Sets up and returns a merged config for evaluation. 
Config + object saved from checkpoint is merged into config file specified + at evaluation time with the following overwrite priority: + eval_opts > ckpt_opts > eval_cfg > ckpt_cfg + If the saved config is outdated, only the eval config is returned. + + Args: + checkpoint_config: saved config from checkpoint. + + Returns: + Config: merged config for eval. + """ + + config = self.config.clone() + + ckpt_cmd_opts = checkpoint_config.CMD_TRAILING_OPTS + eval_cmd_opts = config.CMD_TRAILING_OPTS + + try: + config.merge_from_other_cfg(checkpoint_config) + config.merge_from_other_cfg(self.config) + config.merge_from_list(ckpt_cmd_opts) + config.merge_from_list(eval_cmd_opts) + except KeyError: + logger.info("Saved config is outdated, using solely eval config") + config = self.config.clone() + config.merge_from_list(eval_cmd_opts) + config.defrost() + if config.TASK_CONFIG.DATASET.SPLIT == "train": + config.TASK_CONFIG.DATASET.SPLIT = "val" + config.TASK_CONFIG.SIMULATOR.AGENT_0.SENSORS = self.config.SENSORS + config.freeze() + + return config + + def eval(self) -> None: + r"""Main method of trainer evaluation. Calls _eval_checkpoint() that + is specified in Trainer class that inherits from BaseRLTrainer + or BaseILTrainer + + Returns: + None + """ + self.device = ( + torch.device("cuda", self.config.TORCH_GPU_ID) + if torch.cuda.is_available() + else torch.device("cpu") + ) + + if "tensorboard" in self.config.VIDEO_OPTION: + assert ( + len(self.config.TENSORBOARD_DIR) > 0 + ), "Must specify a tensorboard directory for video display" + os.makedirs(self.config.TENSORBOARD_DIR, exist_ok=True) + if "disk" in self.config.VIDEO_OPTION: + assert ( + len(self.config.VIDEO_DIR) > 0 + ), "Must specify a directory for storing videos on disk" + + with TensorboardWriter( + self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs + ) as writer: + if os.path.isfile(self.config.EVAL_CKPT_PATH_DIR): + # evaluate singe checkpoint + proposed_index = get_checkpoint_id( + self.config.EVAL_CKPT_PATH_DIR + ) + if proposed_index is not None: + ckpt_idx = proposed_index + else: + ckpt_idx = 0 + self._eval_checkpoint( + self.config.EVAL_CKPT_PATH_DIR, + writer, + checkpoint_index=ckpt_idx, + ) + else: + # evaluate multiple checkpoints in order + prev_ckpt_ind = -1 + while True: + current_ckpt = None + while current_ckpt is None: + current_ckpt = poll_checkpoint_folder( + self.config.EVAL_CKPT_PATH_DIR, prev_ckpt_ind + ) + time.sleep(2) # sleep for 2 secs before polling again + logger.info(f"=======current_ckpt: {current_ckpt}=======") + prev_ckpt_ind += 1 + self._eval_checkpoint( + checkpoint_path=current_ckpt, + writer=writer, + checkpoint_index=prev_ckpt_ind, + ) + + def _eval_checkpoint( + self, + checkpoint_path: str, + writer: TensorboardWriter, + checkpoint_index: int = 0, + ) -> None: + raise NotImplementedError + + def save_checkpoint(self, file_name) -> None: + raise NotImplementedError + + def load_checkpoint(self, checkpoint_path, *args, **kwargs) -> Dict: + raise NotImplementedError + + +class BaseRLTrainer(BaseTrainer): + r"""Base trainer class for RL trainers. Future RL-specific + methods should be hosted here. 
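+
+    Exactly one of ``NUM_UPDATES`` / ``TOTAL_NUM_STEPS`` and exactly one of
+    ``NUM_CHECKPOINTS`` / ``CHECKPOINT_INTERVAL`` must be set in the config
+    (the other being -1); ``percent_done()`` and ``should_checkpoint()`` use
+    whichever is set.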
+ """ + device: torch.device # type: ignore + config: Config + video_option: List[str] + num_updates_done: int + num_steps_done: int + _flush_secs: int + _last_checkpoint_percent: float + + def __init__(self, config: Config) -> None: + super().__init__() + assert config is not None, "needs config file to initialize trainer" + self.config = config + self._flush_secs = 30 + self.num_updates_done = 0 + self.num_steps_done = 0 + self._last_checkpoint_percent = -1.0 + + if config.NUM_UPDATES != -1 and config.TOTAL_NUM_STEPS != -1: + raise RuntimeError( + "NUM_UPDATES and TOTAL_NUM_STEPS are both specified. One must be -1.\n" + " NUM_UPDATES: {} TOTAL_NUM_STEPS: {}".format( + config.NUM_UPDATES, config.TOTAL_NUM_STEPS + ) + ) + + if config.NUM_UPDATES == -1 and config.TOTAL_NUM_STEPS == -1: + raise RuntimeError( + "One of NUM_UPDATES and TOTAL_NUM_STEPS must be specified.\n" + " NUM_UPDATES: {} TOTAL_NUM_STEPS: {}".format( + config.NUM_UPDATES, config.TOTAL_NUM_STEPS + ) + ) + + if config.NUM_CHECKPOINTS != -1 and config.CHECKPOINT_INTERVAL != -1: + raise RuntimeError( + "NUM_CHECKPOINTS and CHECKPOINT_INTERVAL are both specified." + " One must be -1.\n" + " NUM_CHECKPOINTS: {} CHECKPOINT_INTERVAL: {}".format( + config.NUM_CHECKPOINTS, config.CHECKPOINT_INTERVAL + ) + ) + + if config.NUM_CHECKPOINTS == -1 and config.CHECKPOINT_INTERVAL == -1: + raise RuntimeError( + "One of NUM_CHECKPOINTS and CHECKPOINT_INTERVAL must be specified" + " NUM_CHECKPOINTS: {} CHECKPOINT_INTERVAL: {}".format( + config.NUM_CHECKPOINTS, config.CHECKPOINT_INTERVAL + ) + ) + + def percent_done(self) -> float: + if self.config.NUM_UPDATES != -1: + return self.num_updates_done / self.config.NUM_UPDATES + else: + return self.num_steps_done / self.config.TOTAL_NUM_STEPS + + def is_done(self) -> bool: + return self.percent_done() >= 1.0 + + def should_checkpoint(self) -> bool: + needs_checkpoint = False + if self.config.NUM_CHECKPOINTS != -1: + checkpoint_every = 1 / self.config.NUM_CHECKPOINTS + if ( + self._last_checkpoint_percent + checkpoint_every + < self.percent_done() + ): + needs_checkpoint = True + self._last_checkpoint_percent = self.percent_done() + else: + needs_checkpoint = ( + self.num_steps_done % self.config.CHECKPOINT_INTERVAL + ) == 0 + + return needs_checkpoint + + @property + def flush_secs(self): + return self._flush_secs + + @flush_secs.setter + def flush_secs(self, value: int): + self._flush_secs = value + + def train(self) -> None: + raise NotImplementedError + + def _eval_checkpoint( + self, + checkpoint_path: str, + writer: TensorboardWriter, + checkpoint_index: int = 0, + ) -> None: + r"""Evaluates a single checkpoint. Trainer algorithms should + implement this. 
+ + Args: + checkpoint_path: path of checkpoint + writer: tensorboard writer object for logging to tensorboard + checkpoint_index: index of cur checkpoint for logging + + Returns: + None + """ + raise NotImplementedError + + def save_checkpoint(self, file_name) -> None: + raise NotImplementedError + + def load_checkpoint(self, checkpoint_path, *args, **kwargs) -> Dict: + raise NotImplementedError + + @staticmethod + def _pause_envs( + envs_to_pause: List[int], + envs: Union[VectorEnv, RLEnv, Env], + test_recurrent_hidden_states: Tensor, + not_done_masks: Tensor, + current_episode_reward: Tensor, + prev_actions: Tensor, + batch: Dict[str, Tensor], + rgb_frames: Union[List[List[Any]], List[List[ndarray]]], + ) -> Tuple[ + Union[VectorEnv, RLEnv, Env], + Tensor, + Tensor, + Tensor, + Tensor, + Dict[str, Tensor], + List[List[Any]], + ]: + # pausing self.envs with no new episode + if len(envs_to_pause) > 0: + state_index = list(range(envs.num_envs)) + for idx in reversed(envs_to_pause): + state_index.pop(idx) + envs.pause_at(idx) + + # indexing along the batch dimensions + test_recurrent_hidden_states = test_recurrent_hidden_states[ + state_index + ] + not_done_masks = not_done_masks[state_index] + current_episode_reward = current_episode_reward[state_index] + prev_actions = prev_actions[state_index] + + for k, v in batch.items(): + batch[k] = v[state_index] + + rgb_frames = [rgb_frames[i] for i in state_index] + + return ( + envs, + test_recurrent_hidden_states, + not_done_masks, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + ) diff --git a/habitat-lab-dialog/habitat_baselines/common/baseline_registry.py b/habitat-lab-dialog/habitat_baselines/common/baseline_registry.py new file mode 100644 index 0000000..2f5b2e6 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/common/baseline_registry.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +r"""BaselineRegistry is extended from habitat.Registry to provide +registration for trainer and environments, while keeping Registry +in habitat core intact. + +Import the baseline registry object using + +.. code:: py + + from habitat_baselines.common.baseline_registry import baseline_registry + +Various decorators for registry different kind of classes with unique keys + +- Register a environment: ``@baseline_registry.register_env`` +- Register a trainer: ``@baseline_registry.register_trainer`` +- Register a policy: ``@baseline_registry.register_policy`` +""" + +from typing import Optional + +from habitat.core.registry import Registry + + +class BaselineRegistry(Registry): + @classmethod + def register_trainer(cls, to_register=None, *, name: Optional[str] = None): + r"""Register a RL training algorithm to registry with key 'name'. + + Args: + name: Key with which the trainer will be registered. + If None will use the name of the class. + + """ + from habitat_baselines.common.base_trainer import BaseTrainer + + return cls._register_impl( + "trainer", to_register, name, assert_type=BaseTrainer + ) + + @classmethod + def get_trainer(cls, name): + return cls._get_impl("trainer", name) + + @classmethod + def register_env(cls, to_register=None, *, name: Optional[str] = None): + r"""Register a environment to registry with key 'name' + currently only support subclass of RLEnv. + + Args: + name: Key with which the env will be registered. 
+            If None will use the name of the class.
+
+        """
+        from habitat import RLEnv
+
+        return cls._register_impl("env", to_register, name, assert_type=RLEnv)
+
+    @classmethod
+    def get_env(cls, name):
+        return cls._get_impl("env", name)
+
+    @classmethod
+    def register_policy(cls, to_register=None, *, name: Optional[str] = None):
+        r"""Register an RL policy with :p:`name`.
+
+        :param name: Key with which the policy will be registered.
+            If :py:`None` will use the name of the class
+
+        .. code:: py
+
+            from habitat_baselines.rl.ppo.policy import Policy
+            from habitat_baselines.common.baseline_registry import (
+                baseline_registry
+            )
+
+            @baseline_registry.register_policy
+            class MyPolicy(Policy):
+                pass
+
+
+            # or
+
+            @baseline_registry.register_policy(name="MyPolicyName")
+            class MyPolicy(Policy):
+                pass
+
+        """
+        from habitat_baselines.rl.ppo.policy import Policy
+
+        return cls._register_impl(
+            "policy", to_register, name, assert_type=Policy
+        )
+
+    @classmethod
+    def get_policy(cls, name: str):
+        r"""Get the RL policy with :p:`name`."""
+        return cls._get_impl("policy", name)
+
+    @classmethod
+    def register_obs_transformer(
+        cls, to_register=None, *, name: Optional[str] = None
+    ):
+        r"""Register an Observation Transformer with :p:`name`.
+
+        :param name: Key with which the observation transformer will be
+            registered. If :py:`None` will use the name of the class
+
+        .. code:: py
+
+            from habitat_baselines.common.obs_transformers import ObservationTransformer
+            from habitat_baselines.common.baseline_registry import (
+                baseline_registry
+            )
+
+            @baseline_registry.register_obs_transformer
+            class MyObsTransformer(ObservationTransformer):
+                pass
+
+
+            # or
+
+            @baseline_registry.register_obs_transformer(name="MyTransformer")
+            class MyObsTransformer(ObservationTransformer):
+                pass
+
+        """
+        from habitat_baselines.common.obs_transformers import (
+            ObservationTransformer,
+        )
+
+        return cls._register_impl(
+            "obs_transformer",
+            to_register,
+            name,
+            assert_type=ObservationTransformer,
+        )
+
+    @classmethod
+    def get_obs_transformer(cls, name: str):
+        r"""Get the Observation Transformer with :p:`name`."""
+        return cls._get_impl("obs_transformer", name)
+
+
+baseline_registry = BaselineRegistry()
diff --git a/habitat-lab-dialog/habitat_baselines/common/environments.py b/habitat-lab-dialog/habitat_baselines/common/environments.py
new file mode 100644
index 0000000..99eeb14
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/common/environments.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+r"""
+This file hosts task-specific or trainer-specific environments for trainers.
+All environments here should be a (direct or indirect) subclass of the Env
+class in habitat. Customized environments should be registered using
+``@baseline_registry.register_env(name="myEnv")`` for reusability.
+"""
+
+from typing import Optional, Type
+
+import habitat
+from habitat import Config, Dataset
+from habitat_baselines.common.baseline_registry import baseline_registry
+
+
+def get_env_class(env_name: str) -> Type[habitat.RLEnv]:
+    r"""Return environment class based on name.
+
+    Args:
+        env_name: name of the environment.
+
+    Returns:
+        Type[habitat.RLEnv]: env class.
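+
+    For example (assuming the ``NavRLEnv`` registered below):
+
+    .. code:: py
+
+        env_class = get_env_class("NavRLEnv")
+        env = env_class(config=config, dataset=dataset)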
+ """ + return baseline_registry.get_env(env_name) + + +@baseline_registry.register_env(name="NavRLEnv") +class NavRLEnv(habitat.RLEnv): + def __init__(self, config: Config, dataset: Optional[Dataset] = None): + self._rl_config = config.RL + self._core_env_config = config.TASK_CONFIG + self._reward_measure_name = self._rl_config.REWARD_MEASURE + self._success_measure_name = self._rl_config.SUCCESS_MEASURE + + self._previous_measure = None + self._previous_action = None + super().__init__(self._core_env_config, dataset) + + def reset(self): + self._previous_action = None + observations = super().reset() + self._previous_measure = self._env.get_metrics()[ + self._reward_measure_name + ] + return observations + + def step(self, *args, **kwargs): + self._previous_action = kwargs["action"] + return super().step(*args, **kwargs) + + def get_reward_range(self): + return ( + self._rl_config.SLACK_REWARD - 1.0, + self._rl_config.SUCCESS_REWARD + 1.0, + ) + + def get_reward(self, observations): + reward = self._rl_config.SLACK_REWARD + + current_measure = self._env.get_metrics()[self._reward_measure_name] + + reward += self._previous_measure - current_measure + self._previous_measure = current_measure + + if self._episode_success(): + reward += self._rl_config.SUCCESS_REWARD + + return reward + + def _episode_success(self): + return self._env.get_metrics()[self._success_measure_name] + + def get_done(self, observations): + done = False + if self._env.episode_over or self._episode_success(): + done = True + return done + + def get_info(self, observations): + return self.habitat_env.get_metrics() diff --git a/habitat-lab-dialog/habitat_baselines/common/obs_transformers.py b/habitat-lab-dialog/habitat_baselines/common/obs_transformers.py new file mode 100644 index 0000000..9617132 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/common/obs_transformers.py @@ -0,0 +1,1223 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the + +# LICENSE file in the root directory of this source tree. + +r"""This module defines various ObservationTransformers that can be used +to transform the output of the simulator before they are fed into the +policy of the neural network. This can include various useful preprocessing +including faking a semantic sensor using RGB input and MaskRCNN or faking +a depth sensor using RGB input. You can also stich together multiple sensors. +This code runs on the batched of inputs to these networks efficiently. +ObservationTransformer all run as nn.modules and can be used for encoders or +any other neural networks preprocessing steps. +Assumes the input is on CUDA. + +They also implement a function that transforms that observation space so help +fake or modify sensor input from the simulator. 
+ +This module API is experimental and likely to change +""" +import abc +import copy +import numbers +from enum import Enum +from typing import Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np +import torch +from gym import spaces +from torch import nn + +from habitat.config import Config +from habitat.core.logging import logger +from habitat_baselines.common.baseline_registry import baseline_registry +from habitat_baselines.utils.common import ( + center_crop, + get_image_height_width, + image_resize_shortest_edge, + overwrite_gym_box_shape, +) + + +class ObservationTransformer(nn.Module, metaclass=abc.ABCMeta): + """This is the base ObservationTransformer class that all other observation + Transformers should extend. from_config must be implemented by the transformer. + transform_observation_space is only needed if the observation_space ie. + (resolution, range, or num of channels change).""" + + def transform_observation_space( + self, observation_space: spaces.Dict, **kwargs + ): + return observation_space + + @classmethod + @abc.abstractmethod + def from_config(cls, config: Config): + pass + + def forward( + self, observations: Dict[str, torch.Tensor] + ) -> Dict[str, torch.Tensor]: + return observations + + +@baseline_registry.register_obs_transformer() +class ResizeShortestEdge(ObservationTransformer): + r"""An nn module the resizes your the shortest edge of the input while maintaining aspect ratio. + This module assumes that all images in the batch are of the same size. + """ + + def __init__( + self, + size: int, + channels_last: bool = True, + trans_keys: Tuple[str] = ("rgb", "depth", "semantic"), + ): + """Args: + size: The size you want to resize the shortest edge to + channels_last: indicates if channels is the last dimension + """ + super(ResizeShortestEdge, self).__init__() + self._size: int = size + self.channels_last: bool = channels_last + self.trans_keys: Tuple[str] = trans_keys + + def transform_observation_space( + self, + observation_space: spaces.Dict, + ): + size = self._size + observation_space = copy.deepcopy(observation_space) + if size: + for key in observation_space.spaces: + if key in self.trans_keys: + # In the observation space dict, the channels are always last + h, w = get_image_height_width( + observation_space.spaces[key], channels_last=True + ) + if size == min(h, w): + continue + scale = size / min(h, w) + new_h = int(h * scale) + new_w = int(w * scale) + new_size = (new_h, new_w) + logger.info( + "Resizing observation of %s: from %s to %s" + % (key, (h, w), new_size) + ) + observation_space.spaces[key] = overwrite_gym_box_shape( + observation_space.spaces[key], new_size + ) + return observation_space + + def _transform_obs(self, obs: torch.Tensor) -> torch.Tensor: + return image_resize_shortest_edge( + obs, self._size, channels_last=self.channels_last + ) + + @torch.no_grad() + def forward( + self, observations: Dict[str, torch.Tensor] + ) -> Dict[str, torch.Tensor]: + if self._size is not None: + observations.update( + { + sensor: self._transform_obs(observations[sensor]) + for sensor in self.trans_keys + if sensor in observations + } + ) + return observations + + @classmethod + def from_config(cls, config: Config): + return cls(config.RL.POLICY.OBS_TRANSFORMS.RESIZE_SHORTEST_EDGE.SIZE) + + +@baseline_registry.register_obs_transformer() +class CenterCropper(ObservationTransformer): + """An observation transformer is a simple nn module that center crops your input.""" + + def __init__( + self, + size: Union[int, Tuple[int, int]], + 
channels_last: bool = True, + trans_keys: Tuple[str] = ("rgb", "depth", "semantic"), + ): + """Args: + size: A sequence (h, w) or int of the size you wish to resize/center_crop. + If int, assumes square crop + channels_list: indicates if channels is the last dimension + trans_keys: The list of sensors it will try to centercrop. + """ + super().__init__() + if isinstance(size, numbers.Number): + size = (int(size), int(size)) + assert len(size) == 2, "forced input size must be len of 2 (h, w)" + self._size = size + self.channels_last = channels_last + self.trans_keys = trans_keys # TODO: Add to from_config constructor + + def transform_observation_space( + self, + observation_space: spaces.Dict, + ): + size = self._size + observation_space = copy.deepcopy(observation_space) + if size: + for key in observation_space.spaces: + if ( + key in self.trans_keys + and observation_space.spaces[key].shape[-3:-1] != size + ): + h, w = get_image_height_width( + observation_space.spaces[key], channels_last=True + ) + logger.info( + "Center cropping observation size of %s from %s to %s" + % (key, (h, w), size) + ) + + observation_space.spaces[key] = overwrite_gym_box_shape( + observation_space.spaces[key], size + ) + return observation_space + + def _transform_obs(self, obs: torch.Tensor) -> torch.Tensor: + return center_crop( + obs, + self._size, + channels_last=self.channels_last, + ) + + @torch.no_grad() + def forward( + self, observations: Dict[str, torch.Tensor] + ) -> Dict[str, torch.Tensor]: + if self._size is not None: + observations.update( + { + sensor: self._transform_obs(observations[sensor]) + for sensor in self.trans_keys + if sensor in observations + } + ) + return observations + + @classmethod + def from_config(cls, config: Config): + cc_config = config.RL.POLICY.OBS_TRANSFORMS.CENTER_CROPPER + return cls( + ( + cc_config.HEIGHT, + cc_config.WIDTH, + ) + ) + + +class _DepthFrom(Enum): + Z_VAL = 0 + OPTI_CENTER = 1 + + +class CameraProjection(metaclass=abc.ABCMeta): + """This is the base CameraProjection class that converts + projection model of images into different one. This can be used for + conversion between cubemap, equirect, fisheye images, etc. + projection that project 3D points onto the image plane and + unprojection that project image points onto unit sphere + must be implemented.""" + + def __init__( + self, + img_h: int, + img_w: int, + R: Optional[torch.Tensor] = None, + depth_from: _DepthFrom = _DepthFrom.OPTI_CENTER, + ): + """Args: + img_h: (int) the height of camera image + img_w: (int) the width of camera image + R: (torch.Tensor) 3x3 rotation matrix of camera + depth_from: (_DepthFrom) the depth from z value or optical center + """ + self.img_h = img_h + self.img_w = img_w + self.depth_from = depth_from + + # Camera rotation: points in world coord = R @ points in camera coord + if R is not None: + self.R = R.float() + else: + self.R = None + + @abc.abstractmethod + def projection( + self, world_pts: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Project points in world coord onto image planes. + Args: + world_pts: 3D points in world coord + Returns: + proj_pts: Projected points for grid_sample, -1 <= proj_pts <= 1 + valid_mask: True if the point is valid (inside FoV) + """ + + @abc.abstractmethod + def unprojection( + self, with_rotation: bool = True + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Unproject 2D image points onto unit sphere. + Args: + with_rotation: If True, unprojected points is in world coord. 
+ If False, unprojected points is in camera coord. + Returns: + unproj_pts: Unprojected 3D points on unit sphere + valid_mask: True if the point is valid (inside FoV) + """ + + @property + def rotation(self): + """Camera rotation: points in world coord = R @ points in camera coord""" + if self.R is None: + return torch.eye(3, dtype=torch.float32) + else: + return self.R + + @property + def shape(self): + """Camera image shape: (img_h, img_w)""" + return (self.img_h, self.img_w) + + def size(self): + """Camera image shape: (img_h, img_w)""" + return self.shape + + def camcoord2worldcoord(self, pts: torch.Tensor): + """Convert points in camera coords into points in world coords. + Args: + pts: 3D points in camera coords + Returns: + rotated_pts: 3D points in world coords + """ + if self.R is None: + return pts + else: + # Rotate points according to camera rotation + _h, _w, _ = pts.shape + # points in world coord = R @ points in camera coord + rotated_pts = torch.matmul(pts.view((-1, 3)), self.R.T) + return rotated_pts.view(_h, _w, 3) + + def worldcoord2camcoord(self, pts: torch.Tensor): + """Convert points in world coords into points in camera coords. + Args: + pts: 3D points in world coords + Returns: + rotated_pts: 3D points in camera coords + """ + if self.R is None: + return pts + else: + # Rotate points according to camera rotation + _h, _w, _ = pts.shape + # points in camera coord = R.T @ points in world coord + rotated_pts = torch.matmul(pts.view((-1, 3)), self.R) + return rotated_pts.view(_h, _w, 3) + + +class PerspectiveProjection(CameraProjection): + """This is the perspective camera projection class.""" + + def __init__( + self, + img_h: int, + img_w: int, + f: Optional[float] = None, + R: Optional[torch.Tensor] = None, + ): + """Args: + img_h: (int) the height of camera image + img_w: (int) the width of camera image + f: (float) the focal length of camera + R: (torch.Tensor) 3x3 rotation matrix of camera + """ + super(PerspectiveProjection, self).__init__( + img_h, img_w, R, _DepthFrom.Z_VAL + ) + if f is None: + self.f = max(img_h, img_w) / 2 + else: + self.f = f + + def projection( + self, world_pts: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Rotate world points according to camera rotation + world_pts = self.worldcoord2camcoord(world_pts) + + # Project points onto image plane + img_pts = self.f * world_pts / torch.abs(world_pts[..., 2:3]) + cx = self.img_w / 2 + cy = self.img_h / 2 + u = img_pts[..., 0] + cx + v = img_pts[..., 1] + cy + + # For grid_sample, -1 <= proj_pts <= 1 + mapx = 2 * u / self.img_w - 1.0 + mapy = 2 * v / self.img_h - 1.0 + proj_pts = torch.stack([mapx, mapy], dim=-1) + + # Valid mask + valid_mask = torch.abs(proj_pts).max(-1)[0] <= 1 # -1 <= grid.xy <= 1 + valid_mask *= img_pts[..., 2] > 0 + return proj_pts, valid_mask + + def unprojection( + self, with_rotation: bool = True + ) -> Tuple[torch.Tensor, torch.Tensor]: + v, u = torch.meshgrid( + torch.arange(self.img_h), torch.arange(self.img_w) + ) + x = (u + 0.5) - self.img_w / 2 + y = (v + 0.5) - self.img_h / 2 + z = torch.full_like(x, self.f, dtype=torch.float) + unproj_pts = torch.stack([x, y, z], dim=-1) + # Project on unit shpere + unproj_pts /= torch.norm(unproj_pts, dim=-1, keepdim=True) + # All points in image are valid + valid_mask = torch.full(unproj_pts.shape[:2], True, dtype=torch.bool) + + # Rotate unproj_pts points according to camera rotation + if with_rotation: + unproj_pts = self.camcoord2worldcoord(unproj_pts) + + return unproj_pts, valid_mask + + +class 
EquirectProjection(CameraProjection): + """This is the equirectanglar camera projection class.""" + + def __init__( + self, img_h: int, img_w: int, R: Optional[torch.Tensor] = None + ): + """Args: + img_h: (int) the height of equirectanglar camera image + img_w: (int) the width of equirectanglar camera image + R: (torch.Tensor) 3x3 rotation matrix of camera + """ + super(EquirectProjection, self).__init__(img_h, img_w, R) + + def projection( + self, world_pts: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Rotate world points according to camera rotation + world_pts = self.worldcoord2camcoord(world_pts) + + x, y, z = world_pts[..., 0], world_pts[..., 1], world_pts[..., 2] + # x,y,z to theta, phi + theta = torch.atan2(x, z) + c = torch.sqrt(x * x + z * z) + phi = torch.atan2(y, c) + + # For grid_sample, -1 <= proj_pts <= 1 + mapx = theta / np.pi + mapy = phi / (np.pi / 2) + proj_pts = torch.stack([mapx, mapy], dim=-1) + + # All points in image are valid + valid_mask = torch.full(proj_pts.shape[:2], True, dtype=torch.bool) + return proj_pts, valid_mask + + def unprojection( + self, with_rotation: bool = True + ) -> Tuple[torch.Tensor, torch.Tensor]: + theta_map, phi_map = self.get_theta_phi_map(self.img_h, self.img_w) + unproj_pts = self.angle2sphere(theta_map, phi_map) + # All points in image are valid + valid_mask = torch.full(unproj_pts.shape[:2], True, dtype=torch.bool) + # Rotate unproj_pts points according to camera rotation + if with_rotation: + unproj_pts = self.camcoord2worldcoord(unproj_pts) + return unproj_pts, valid_mask + + def get_theta_phi_map( + self, img_h: int, img_w: int + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Get theta and phi map for equirectangular image. + PI < theta_map < PI, PI/2 < phi_map < PI/2 + """ + phi, theta = torch.meshgrid(torch.arange(img_h), torch.arange(img_w)) + theta_map = (theta + 0.5) * 2 * np.pi / img_w - np.pi + phi_map = (phi + 0.5) * np.pi / img_h - np.pi / 2 + return theta_map, phi_map + + def angle2sphere( + self, theta_map: torch.Tensor, phi_map: torch.Tensor + ) -> torch.Tensor: + """Project points on unit sphere based on theta and phi map.""" + sin_theta = torch.sin(theta_map) + cos_theta = torch.cos(theta_map) + sin_phi = torch.sin(phi_map) + cos_phi = torch.cos(phi_map) + return torch.stack( + [cos_phi * sin_theta, sin_phi, cos_phi * cos_theta], dim=-1 + ) + + +class FisheyeProjection(CameraProjection): + r"""This is the fisheye camera projection class. + The camera model is based on the Double Sphere Camera Model (Usenko et. al.;3DV 2018). 
+ Paper: https://arxiv.org/abs/1807.08957 + Implementation: https://github.com/matsuren/dscamera + """ + + def __init__( + self, + img_h: int, + img_w: int, + fish_fov: float, + cx: float, + cy: float, + fx: float, + fy: float, + xi: float, + alpha: float, + R: Optional[torch.Tensor] = None, + ): + """Args: + img_h: (int) the height of fisheye camera image + img_w: (int) the width of fisheye camera image + fish_fov: (float) the fov of fisheye camera in degrees + cx, cy: (float) the optical center of the fisheye camera + fx, fy, xi, alpha: (float) the fisheye camera model parameters + R: (torch.Tensor) 3x3 rotation matrix of camera + """ + super(FisheyeProjection, self).__init__(img_h, img_w, R) + + self.fish_fov = fish_fov # FoV in degrees + fov_rad = self.fish_fov / 180 * np.pi # FoV in radians + self.fov_cos = np.cos(fov_rad / 2) + self.fish_param = [cx, cy, fx, fy, xi, alpha] + + def projection( + self, world_pts: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Rotate world points according to camera rotation + world_pts = self.worldcoord2camcoord(world_pts) + + # Unpack parameters + cx, cy, fx, fy, xi, alpha = self.fish_param + # Unpack 3D world points + x, y, z = world_pts[..., 0], world_pts[..., 1], world_pts[..., 2] + + # Calculate fov + world_pts_fov_cos = z # point3D @ z_axis + fov_mask = world_pts_fov_cos >= self.fov_cos + + # Calculate projection + x2 = x * x + y2 = y * y + z2 = z * z + d1 = torch.sqrt(x2 + y2 + z2) + zxi = xi * d1 + z + d2 = torch.sqrt(x2 + y2 + zxi * zxi) + + div = alpha * d2 + (1 - alpha) * zxi + u = fx * x / div + cx + v = fy * y / div + cy + + # Projected points on image plane + # For grid_sample, -1 <= proj_pts <= 1 + mapx = 2 * u / self.img_w - 1.0 + mapy = 2 * v / self.img_h - 1.0 + proj_pts = torch.stack([mapx, mapy], dim=-1) + + # Check valid area + if alpha <= 0.5: + w1 = alpha / (1 - alpha) + else: + w1 = (1 - alpha) / alpha + w2 = w1 + xi / np.sqrt(2 * w1 * xi + xi * xi + 1) + valid_mask = z > -w2 * d1 + valid_mask *= fov_mask + + return proj_pts, valid_mask + + def unprojection( + self, with_rotation: bool = True + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Unpack parameters + cx, cy, fx, fy, xi, alpha = self.fish_param + + # Calculate unprojection + v, u = torch.meshgrid( + [torch.arange(self.img_h), torch.arange(self.img_w)] + ) + mx = (u - cx) / fx + my = (v - cy) / fy + r2 = mx * mx + my * my + mz = (1 - alpha * alpha * r2) / ( + alpha * torch.sqrt(1 - (2 * alpha - 1) * r2) + 1 - alpha + ) + mz2 = mz * mz + + k1 = mz * xi + torch.sqrt(mz2 + (1 - xi * xi) * r2) + k2 = mz2 + r2 + k = k1 / k2 + + # Unprojected unit vectors + unproj_pts = k.unsqueeze(-1) * torch.stack([mx, my, mz], dim=-1) + unproj_pts[..., 2] -= xi + + # Calculate fov + unproj_fov_cos = unproj_pts[..., 2] # unproj_pts @ z_axis + fov_mask = unproj_fov_cos >= self.fov_cos + if alpha > 0.5: + fov_mask *= r2 <= (1 / (2 * alpha - 1)) + + # Rotate unproj_pts points according to camera rotation + if with_rotation: + unproj_pts = self.camcoord2worldcoord(unproj_pts) + + return unproj_pts, fov_mask + + +class ProjectionConverter(nn.Module): + r"""This is the implementation to convert {cubemap, equirect, fisheye} images + into {perspective, equirect, fisheye} images. 
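+
+    For example, the cubemap-to-equirectangular conversion wrapped by
+    ``Cube2Equirect`` below can be assembled directly (a sketch; the image
+    sizes are placeholders):
+
+    .. code:: py
+
+        converter = ProjectionConverter(
+            get_cubemap_projections(256, 256),
+            EquirectProjection(256, 512),
+        )
+        # faces: NCHW tensor stacking the six cube faces per environment in
+        # the order [BACK, DOWN, FRONT, LEFT, RIGHT, UP], so N % 6 == 0
+        equirect = converter(faces)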
+ """ + + def __init__( + self, + input_projections: Union[List[CameraProjection], CameraProjection], + output_projections: Union[List[CameraProjection], CameraProjection], + ): + """Args: + input_projections: input images of projection models + output_projections: generated image of projection models + """ + super(ProjectionConverter, self).__init__() + # Convert to list + if not isinstance(input_projections, list): + input_projections = [input_projections] + if not isinstance(output_projections, list): + output_projections = [output_projections] + + self.input_models = input_projections + self.output_models = output_projections + self.input_len = len(self.input_models) + self.output_len = len(self.output_models) + + # Check image size + input_size = self.input_models[0].size() + for it in self.input_models: + assert ( + input_size == it.size() + ), "All input models must have the same image size" + + output_size = self.output_models[0].size() + for it in self.output_models: + assert ( + output_size == it.size() + ), "All output models must have the same image size" + + # Check if depth conversion is required + # If depth is in z value in input, conversion is required + self.input_zfactor = self.calculate_zfactor(self.input_models) + # If depth is in z value in output, inverse conversion is required + self.output_zfactor = self.calculate_zfactor( + self.output_models, inverse=True + ) + + # grids shape: (output_len, input_len, output_img_h, output_img_w, 2) + self.grids = self.generate_grid() + # _grids_cache shape: (batch_size*output_len*input_len, output_img_h, output_img_w, 2) + self._grids_cache = None + + def _generate_grid_one_output( + self, output_model: CameraProjection + ) -> torch.Tensor: + # Obtain points on unit sphere + world_pts, not_assigned_mask = output_model.unprojection() + # Generate grid + grids = [] + for input_model in self.input_models: + grid, input_mask = input_model.projection(world_pts) + # Make sure each point is only assigned to single input + input_mask *= not_assigned_mask + # Values bigger than one will be ignored by grid_sample + grid[~input_mask] = 2 + # Update not_assigned_mask + not_assigned_mask *= ~input_mask + grids.append(grid) + grids = torch.stack(grids, dim=0) + return grids + + def generate_grid(self) -> torch.Tensor: + multi_output_grids = [] + for output_model in self.output_models: + grids = self._generate_grid_one_output(output_model) + multi_output_grids.append(grids.unsqueeze(1)) + multi_output_grids = torch.cat(multi_output_grids, dim=1) + return multi_output_grids # input_len, output_len, output_img_h, output_img_w, 2 + + def _convert(self, batch: torch.Tensor) -> torch.Tensor: + """Takes a batch of images stacked in proper order and converts thems, + reduces batch size by input_len.""" + batch_size, ch, _H, _W = batch.shape + out_h, out_w = self.output_models[0].size() + if batch_size == 0 or batch_size % self.input_len != 0: + raise ValueError(f"Batch size should be {self.input_len}x") + output = torch.nn.functional.grid_sample( + batch, + self._grids_cache, + align_corners=True, + padding_mode="zeros", + ) + output = output.view( + batch_size // self.input_len, + self.input_len, + ch, + out_h, + out_w, + ).sum(dim=1) + return output # output_len * batch_size, ch, output_model.img_h, output_model.img_w + + def to_converted_tensor(self, batch: torch.Tensor) -> torch.Tensor: + """Convert tensors based on projection models. 
+        batches from two envs (R_1st, G_1st, B_1st) and (R_2nd, G_2nd, B_2nd),
+        the input order is [R_1st, G_1st, B_1st, R_2nd, G_2nd, B_2nd]
+        """
+        # batch tensor order should be NCHW
+        batch_size, ch, in_h, in_w = batch.size()
+
+        out_h, out_w = self.output_models[0].size()
+
+        # Check whether batch size is a multiple of len(self.input_models)
+        if batch_size == 0 or batch_size % self.input_len != 0:
+            raise ValueError(f"Batch size should be a multiple of {self.input_len}")
+
+        # How many sets of input.
+        num_input_set = batch_size // self.input_len
+
+        # to(device) is a NOOP after the first call
+        self.grids = self.grids.to(batch.device)
+
+        # Adjust batch for multiple outputs
+        # batch must be [1st batch * output_len, 2nd batch * output_len, ...],
+        # not [1st batch, 2nd batch, ...] * output_len
+        multi_out_batch = (
+            batch.view(num_input_set, self.input_len, ch, in_h, in_w)
+            .repeat(1, self.output_len, 1, 1, 1)
+            .view(self.output_len * batch_size, ch, in_h, in_w)
+        )
+
+        # Cache the repeated grids for subsequent batches
+        if (
+            self._grids_cache is None
+            or self._grids_cache.size()[0] != multi_out_batch.size()[0]
+        ):
+            # (Re)build the grid cache to match the new batch size
+            self._grids_cache = self.grids.repeat(
+                num_input_set, 1, 1, 1, 1
+            ).view(batch_size * self.output_len, out_h, out_w, 2)
+            self._grids_cache = self._grids_cache.to(batch.device)
+
+        return self._convert(multi_out_batch)
+
+    def calculate_zfactor(
+        self, projections: List[CameraProjection], inverse: bool = False
+    ) -> Optional[torch.Tensor]:
+        """Calculate z factor based on camera projection models. z_factor is
+        used for converting depth in z value to depth from the optical center
+        (for input_models), or for the inverse conversion of depth from the
+        optical center to depth in z value (inverse = True, for output_models).
+        Whether the conversion is required or not is decided based on the
+        depth_from property of the CameraProjection class.
+        Args:
+            projections: input or output projection models
+            inverse: True to convert depth from optical center to z value
+                     False to convert z value to depth from optical center
+        Returns:
+            z_factors: z factor. Return None if conversion is not required.
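+
+        For a unit ray (x, y, z) through a pixel, a z-buffer depth d
+        corresponds to a distance of d / z from the optical center, which is
+        why each factor is computed as 1 / pts_on_sphere[..., 2].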
+ """ + z_factors = [] + for cam in projections: + if cam.depth_from == _DepthFrom.Z_VAL: + pts_on_sphere, _ = cam.unprojection(with_rotation=False) + zval_to_optcenter = 1 / pts_on_sphere[..., 2] + z_factors.append(zval_to_optcenter.unsqueeze(0)) + else: + all_one = torch.full( + (1, cam.img_h, cam.img_w), 1.0, dtype=torch.float + ) + z_factors.append(all_one) + z_factors = torch.stack(z_factors) + + if (z_factors == 1.0).all(): + # All input cameras have depth from optical center + return None + else: + if not inverse: + # for input_models + return z_factors + else: + # for output_models + return 1 / z_factors + + def forward( + self, batch: torch.Tensor, is_depth: bool = False + ) -> torch.Tensor: + + # Depth conversion for input tensors + if is_depth and self.input_zfactor is not None: + input_b = batch.size()[0] // self.input_len + self.input_zfactor = self.input_zfactor.to(batch.device) + batch = batch * self.input_zfactor.repeat(input_b, 1, 1, 1) + + # Common operator to convert projection models + out = self.to_converted_tensor(batch) + + # Depth conversion for output tensors + if is_depth and self.output_zfactor is not None: + output_b = out.size()[0] // self.output_len + self.output_zfactor = self.output_zfactor.to(batch.device) + out = out * self.output_zfactor.repeat(output_b, 1, 1, 1) + + return out + + +def get_cubemap_projections( + img_h: int = 256, img_w: int = 256 +) -> List[CameraProjection]: + """Get cubemap camera projections that consist of six PerspectiveCameras. + The orders are 'BACK', 'DOWN', 'FRONT', 'LEFT', 'RIGHT', 'UP'. + Args: + img_h: (int) the height of camera image + img_w: (int) the width of camera image + + The rotation matrices are equivalent to + .. code-block:: python + from scipy.spatial.transform import Rotation + rotations = [ + Rotation.from_euler("y", 180, degrees=True), # Back + Rotation.from_euler("x", -90, degrees=True), # Down + Rotation.from_euler("x", 0, degrees=True), # Front + Rotation.from_euler("y", -90, degrees=True), # Left + Rotation.from_euler("y", 90, degrees=True), # Right + Rotation.from_euler("x", 90, degrees=True) # Up + ] + """ + rotations = [ + torch.tensor([[-1, 0, 0], [0, 1, 0], [0, 0, -1]]), # Back + torch.tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]), # Down + torch.tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), # Front + torch.tensor([[0, 0, -1], [0, 1, 0], [1, 0, 0]]), # Left + torch.tensor([[0, 0, 1], [0, 1, 0], [-1, 0, 0]]), # Right + torch.tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]), # Up + ] + + projections = [] + for rot in rotations: + cam = PerspectiveProjection(img_h, img_w, R=rot) + projections.append(cam) + return projections + + +class Cube2Equirect(ProjectionConverter): + """This is the backend Cube2Equirect nn.module that does the stiching. + Inspired from https://github.com/fuenwang/PanoramaUtility and + optimized for modern PyTorch.""" + + def __init__(self, equ_h: int, equ_w: int): + """Args: + equ_h: (int) the height of the generated equirect + equ_w: (int) the width of the generated equirect + """ + + # Cubemap input + input_projections = get_cubemap_projections() + + # Equirectangular output + output_projection = EquirectProjection(equ_h, equ_w) + super(Cube2Equirect, self).__init__( + input_projections, output_projection + ) + + +class ProjectionTransformer(ObservationTransformer): + r""" + ProjectionTransformer base class. It can be used to convert {cubemap, equirect, fisheye} images + into {perspective, equirect, fisheye} images in ObservationTransformer. 
+ """ + + def __init__( + self, + converter: ProjectionConverter, + sensor_uuids: List[str], + image_shape: Tuple[int, int], + channels_last: bool = False, + target_uuids: Optional[List[str]] = None, + depth_key: str = "depth", + ): + r""":param converter: ProjectionConverter class + :param sensor_uuids: List of sensor_uuids + :param image_shape: The shape of the output image (height, width) + :param channels_last: Are the channels last in the input + :param target_uuids: Optional List of which of the sensor_uuids to overwrite + :param depth_key: If sensor_uuids has depth_key substring, they are processed as depth + """ + super(ProjectionTransformer, self).__init__() + num_sensors = len(sensor_uuids) + assert ( + num_sensors % converter.input_len == 0 and num_sensors != 0 + ), f"{len(sensor_uuids)}: length of sensors is not a multiple of {converter.input_len}" + # TODO verify attributes of the sensors in the config if possible. Think about API design + assert ( + len(image_shape) == 2 + ), f"image_shape must be a tuple of (height, width), given: {image_shape}" + self.sensor_uuids: List[str] = sensor_uuids + self.img_shape: Tuple[int, int] = image_shape + self.channels_last: bool = channels_last + self.converter = converter + if target_uuids is None: + self.target_uuids: List[str] = self.sensor_uuids[::6] + else: + self.target_uuids: List[str] = target_uuids + self.depth_key = depth_key + + def transform_observation_space( + self, + observation_space: spaces.Dict, + ): + r"""Transforms the target UUID's sensor obs_space so it matches the new shape (H, W)""" + # Transforms the observation space to of the target UUID + for i, key in enumerate(self.target_uuids): + assert ( + key in observation_space.spaces + ), f"{key} not found in observation space: {observation_space.spaces}" + h, w = get_image_height_width( + observation_space.spaces[key], channels_last=True + ) + in_len = self.converter.input_len + logger.info( + f"Overwrite sensor: {key} from size of ({h}, {w}) to image of" + f" {self.img_shape} from sensors: {self.sensor_uuids[i*in_len:(i+1)*in_len]}" + ) + if (h, w) != self.img_shape: + observation_space.spaces[key] = overwrite_gym_box_shape( + observation_space.spaces[key], self.img_shape + ) + return observation_space + + @torch.no_grad() + def forward( + self, observations: Dict[str, torch.Tensor] + ) -> Dict[str, torch.Tensor]: + + for i, target_sensor_uuid in enumerate(self.target_uuids): + # number of input and input sensor uuids + in_len = self.converter.input_len + in_sensor_uuids = self.sensor_uuids[i * in_len : (i + 1) * in_len] + + # If the sensor is depth + is_depth = any(self.depth_key in s for s in in_sensor_uuids) + + # The UUID we are overwriting + assert target_sensor_uuid in in_sensor_uuids + sensor_obs = [observations[sensor] for sensor in in_sensor_uuids] + target_obs = observations[target_sensor_uuid] + sensor_dtype = target_obs.dtype + # Stacking along axis makes the flattening go in the right order. 
+            imgs = torch.stack(sensor_obs, axis=1)
+            imgs = torch.flatten(imgs, end_dim=1)
+            if not self.channels_last:
+                imgs = imgs.permute((0, 3, 1, 2))  # NHWC => NCHW
+            imgs = imgs.float()  # NCHW
+            # Here is where the projection conversion happens
+            output = self.converter(imgs, is_depth=is_depth)
+
+            # for debugging
+            # torchvision.utils.save_image(output, f'sample_eqr_{target_sensor_uuid}.jpg', normalize=True, range=(0, 255) if 'rgb' in target_sensor_uuid else (0, 1))
+            output = output.to(dtype=sensor_dtype)
+            if not self.channels_last:
+                output = output.permute((0, 2, 3, 1))  # NCHW => NHWC
+            observations[target_sensor_uuid] = output
+        return observations
+
+
+@baseline_registry.register_obs_transformer()
+class CubeMap2Equirect(ProjectionTransformer):
+    r"""This is an experimental use of ObservationTransformer that converts a cubemap
+    output to an equirectangular one through projection. This needs to be fed
+    a list of 6 cameras at various orientations but will be able to stitch a
+    360 sensor out of these inputs. The code below will generate a config that
+    has the 6 sensors in the proper orientations. This code also assumes a 90
+    degree FoV.
+
+    Sensor order for cubemap stitching is Back, Down, Front, Left, Right, Up.
+    The output will be written to the UUID of the first sensor.
+    """
+
+    def __init__(
+        self,
+        sensor_uuids: List[str],
+        eq_shape: Tuple[int, int],
+        channels_last: bool = False,
+        target_uuids: Optional[List[str]] = None,
+        depth_key: str = "depth",
+    ):
+        r""":param sensor_uuids: List of sensor_uuids: Back, Down, Front, Left, Right, Up.
+        :param eq_shape: The shape of the equirectangular output (height, width)
+        :param channels_last: Are the channels last in the input
+        :param target_uuids: Optional List of which of the sensor_uuids to overwrite
+        :param depth_key: If sensor_uuids has depth_key substring, they are processed as depth
+        """
+
+        converter = Cube2Equirect(eq_shape[0], eq_shape[1])
+        super(CubeMap2Equirect, self).__init__(
+            converter,
+            sensor_uuids,
+            eq_shape,
+            channels_last,
+            target_uuids,
+            depth_key,
+        )
+
+    @classmethod
+    def from_config(cls, config):
+        cube2eq_config = config.RL.POLICY.OBS_TRANSFORMS.CUBE2EQ
+        if hasattr(cube2eq_config, "TARGET_UUIDS"):
+            # Optional Config Value to specify target UUID
+            target_uuids = cube2eq_config.TARGET_UUIDS
+        else:
+            target_uuids = None
+        return cls(
+            cube2eq_config.SENSOR_UUIDS,
+            eq_shape=(
+                cube2eq_config.HEIGHT,
+                cube2eq_config.WIDTH,
+            ),
+            target_uuids=target_uuids,
+        )
+
+
+class Cube2Fisheye(ProjectionConverter):
+    r"""This is the implementation to generate fisheye images from cubemap images.
+    The camera model is based on the Double Sphere Camera Model (Usenko et al., 3DV 2018).
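+    In this model, xi shifts the second projection sphere along the optical
+    axis and alpha blends the two sphere projections; see the corresponding
+    math in FisheyeProjection.projection() above.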
+    Paper: https://arxiv.org/abs/1807.08957
+    """
+
+    def __init__(
+        self,
+        fish_h: int,
+        fish_w: int,
+        fish_fov: float,
+        cx: float,
+        cy: float,
+        fx: float,
+        fy: float,
+        xi: float,
+        alpha: float,
+    ):
+        """Args:
+        fish_h: (int) the height of the generated fisheye
+        fish_w: (int) the width of the generated fisheye
+        fish_fov: (float) the fov of the generated fisheye in degrees
+        cx, cy: (float) the optical center of the generated fisheye
+        fx, fy, xi, alpha: (float) the fisheye camera model parameters
+        """
+
+        # Cubemap input
+        input_projections = get_cubemap_projections()
+
+        # Fisheye output
+        output_projection = FisheyeProjection(
+            fish_h, fish_w, fish_fov, cx, cy, fx, fy, xi, alpha
+        )
+        super(Cube2Fisheye, self).__init__(
+            input_projections, output_projection
+        )
+
+
+@baseline_registry.register_obs_transformer()
+class CubeMap2Fisheye(ProjectionTransformer):
+    r"""This is an experimental use of ObservationTransformer that converts a cubemap
+    output to a fisheye one through projection. This needs to be fed
+    a list of 6 cameras at various orientations but will be able to stitch a
+    fisheye image out of these inputs. The code below will generate a config that
+    has the 6 sensors in the proper orientations. This code also assumes a 90
+    degree FoV.
+
+    Sensor order for cubemap stitching is Back, Down, Front, Left, Right, Up.
+    The output will be written to the UUID of the first sensor.
+    """
+
+    def __init__(
+        self,
+        sensor_uuids: List[str],
+        fish_shape: Tuple[int, int],
+        fish_fov: float,
+        fish_params: Tuple[float],
+        channels_last: bool = False,
+        target_uuids: Optional[List[str]] = None,
+        depth_key: str = "depth",
+    ):
+        r""":param sensor_uuids: List of sensor_uuids: Back, Down, Front, Left, Right, Up.
+        :param fish_shape: The shape of the fisheye output (height, width)
+        :param fish_fov: The FoV of the fisheye output in degrees
+        :param fish_params: The camera parameters of the fisheye output (f, xi, alpha)
+        :param channels_last: Are the channels last in the input
+        :param target_uuids: Optional List of which of the sensor_uuids to overwrite
+        :param depth_key: If sensor_uuids has depth_key substring, they are processed as depth
+        """
+
+        assert (
+            len(fish_params) == 3
+        ), "fish_params must have three parameters (f, xi, alpha)"
+        # fisheye camera parameters
+        fx = fish_params[0] * min(fish_shape)
+        fy = fx
+        cx = fish_shape[1] / 2
+        cy = fish_shape[0] / 2
+        xi = fish_params[1]
+        alpha = fish_params[2]
+        converter: ProjectionConverter = Cube2Fisheye(
+            fish_shape[0], fish_shape[1], fish_fov, cx, cy, fx, fy, xi, alpha
+        )
+
+        super(CubeMap2Fisheye, self).__init__(
+            converter,
+            sensor_uuids,
+            fish_shape,
+            channels_last,
+            target_uuids,
+            depth_key,
+        )
+
+    @classmethod
+    def from_config(cls, config):
+        cube2fish_config = config.RL.POLICY.OBS_TRANSFORMS.CUBE2FISH
+        if hasattr(cube2fish_config, "TARGET_UUIDS"):
+            # Optional Config Value to specify target UUID
+            target_uuids = cube2fish_config.TARGET_UUIDS
+        else:
+            target_uuids = None
+        return cls(
+            cube2fish_config.SENSOR_UUIDS,
+            fish_shape=(
+                cube2fish_config.HEIGHT,
+                cube2fish_config.WIDTH,
+            ),
+            fish_fov=cube2fish_config.FOV,
+            fish_params=cube2fish_config.PARAMS,
+            target_uuids=target_uuids,
+        )
+
+
+class Equirect2Cube(ProjectionConverter):
+    """This is the backend Equirect2CubeMap that converts an equirectangular image
+    to cubemap images."""
+
+    def __init__(self, img_h: int, img_w: int):
+        """Args:
+        img_h: (int) the height of the generated cubemap
+        img_w: (int) the width of the generated cubemap
+        """
+
+        # Equirectangular input
+        input_projection = EquirectProjection(256, 512)
+
+        # Cubemap output
+        output_projections = get_cubemap_projections(img_h, img_w)
+        super(Equirect2Cube, self).__init__(
+            input_projection, output_projections
+        )
+
+
+@baseline_registry.register_obs_transformer()
+class Equirect2CubeMap(ProjectionTransformer):
+    r"""This is an experimental use of ObservationTransformer that converts
+    an equirectangular image to cubemap images.
+    Cubemap order is Back, Down, Front, Left, Right, Up.
+    The output will be written to the UUID of the first sensor.
+    """
+
+    def __init__(
+        self,
+        sensor_uuids: List[str],
+        img_shape: Tuple[int, int],
+        channels_last: bool = False,
+        target_uuids: Optional[List[str]] = None,
+        depth_key: str = "depth",
+    ):
+        r""":param sensor_uuids: List of sensor_uuids: Back, Down, Front, Left, Right, Up.
+        :param img_shape: The shape of the cubemap output (height, width)
+        :param channels_last: Are the channels last in the input
+        :param target_uuids: Optional List of which of the sensor_uuids to overwrite
+        :param depth_key: If sensor_uuids has depth_key substring, they are processed as depth
+        """
+
+        converter = Equirect2Cube(img_shape[0], img_shape[1])
+        super(Equirect2CubeMap, self).__init__(
+            converter,
+            sensor_uuids,
+            img_shape,
+            channels_last,
+            target_uuids,
+            depth_key,
+        )
+
+    @classmethod
+    def from_config(cls, config):
+        eq2cube_config = config.RL.POLICY.OBS_TRANSFORMS.EQ2CUBE
+
+        if hasattr(eq2cube_config, "TARGET_UUIDS"):
+            # Optional Config Value to specify target UUID
+            target_uuids = eq2cube_config.TARGET_UUIDS
+        else:
+            target_uuids = None
+        return cls(
+            eq2cube_config.SENSOR_UUIDS,
+            img_shape=(
+                eq2cube_config.HEIGHT,
+                eq2cube_config.WIDTH,
+            ),
+            target_uuids=target_uuids,
+        )
+
+
+def get_active_obs_transforms(config: Config) -> List[ObservationTransformer]:
+    active_obs_transforms = []
+    if hasattr(config.RL.POLICY, "OBS_TRANSFORMS"):
+        obs_transform_names = (
+            config.RL.POLICY.OBS_TRANSFORMS.ENABLED_TRANSFORMS
+        )
+        for obs_transform_name in obs_transform_names:
+            obs_trans_cls = baseline_registry.get_obs_transformer(
+                obs_transform_name
+            )
+            obs_transform = obs_trans_cls.from_config(config)
+            active_obs_transforms.append(obs_transform)
+    return active_obs_transforms
+
+
+def apply_obs_transforms_batch(
+    batch: Dict[str, torch.Tensor],
+    obs_transforms: Iterable[ObservationTransformer],
+) -> Dict[str, torch.Tensor]:
+    for obs_transform in obs_transforms:
+        batch = obs_transform(batch)
+    return batch
+
+
+def apply_obs_transforms_obs_space(
+    obs_space: spaces.Dict, obs_transforms: Iterable[ObservationTransformer]
+) -> spaces.Dict:
+    for obs_transform in obs_transforms:
+        obs_space = obs_transform.transform_observation_space(obs_space)
+    return obs_space
diff --git a/habitat-lab-dialog/habitat_baselines/common/rollout_storage.py b/habitat-lab-dialog/habitat_baselines/common/rollout_storage.py
new file mode 100644
index 0000000..b0b9910
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/common/rollout_storage.py
@@ -0,0 +1,212 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
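+
+# Note: every buffer below is allocated with numsteps + 1 time slots; the
+# extra slot holds the bootstrap step that after_update() copies back to
+# index 0 at the start of the next rollout.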
+ +import warnings + +import numpy as np +import torch + +from habitat_baselines.common.tensor_dict import TensorDict + + +class RolloutStorage: + r"""Class for storing rollout information for RL trainers.""" + + def __init__( + self, + numsteps, + num_envs, + observation_space, + action_space, + recurrent_hidden_state_size, + num_recurrent_layers=1, + is_double_buffered: bool = False, + ): + self.buffers = TensorDict() + self.buffers["observations"] = TensorDict() + + for sensor in observation_space.spaces: + self.buffers["observations"][sensor] = torch.from_numpy( + np.zeros( + ( + numsteps + 1, + num_envs, + *observation_space.spaces[sensor].shape, + ), + dtype=observation_space.spaces[sensor].dtype, + ) + ) + + self.buffers["recurrent_hidden_states"] = torch.zeros( + numsteps + 1, + num_envs, + num_recurrent_layers, + recurrent_hidden_state_size, + ) + + self.buffers["rewards"] = torch.zeros(numsteps + 1, num_envs, 1) + self.buffers["value_preds"] = torch.zeros(numsteps + 1, num_envs, 1) + self.buffers["returns"] = torch.zeros(numsteps + 1, num_envs, 1) + + self.buffers["action_log_probs"] = torch.zeros( + numsteps + 1, num_envs, 1 + ) + if action_space.__class__.__name__ == "ActionSpace": + action_shape = 1 + else: + action_shape = action_space.shape[0] + + self.buffers["actions"] = torch.zeros( + numsteps + 1, num_envs, action_shape + ) + self.buffers["prev_actions"] = torch.zeros( + numsteps + 1, num_envs, action_shape + ) + if action_space.__class__.__name__ == "ActionSpace": + self.buffers["actions"] = self.buffers["actions"].long() + self.buffers["prev_actions"] = self.buffers["prev_actions"].long() + + self.buffers["masks"] = torch.zeros( + numsteps + 1, num_envs, 1, dtype=torch.bool + ) + + self.is_double_buffered = is_double_buffered + self._nbuffers = 2 if is_double_buffered else 1 + self._num_envs = num_envs + + assert (self._num_envs % self._nbuffers) == 0 + + self.numsteps = numsteps + self.current_rollout_step_idxs = [0 for _ in range(self._nbuffers)] + + @property + def current_rollout_step_idx(self) -> int: + assert all( + s == self.current_rollout_step_idxs[0] + for s in self.current_rollout_step_idxs + ) + return self.current_rollout_step_idxs[0] + + def to(self, device): + self.buffers.map_in_place(lambda v: v.to(device)) + + def insert( + self, + next_observations=None, + next_recurrent_hidden_states=None, + actions=None, + action_log_probs=None, + value_preds=None, + rewards=None, + next_masks=None, + buffer_index: int = 0, + ): + if not self.is_double_buffered: + assert buffer_index == 0 + + next_step = dict( + observations=next_observations, + recurrent_hidden_states=next_recurrent_hidden_states, + prev_actions=actions, + masks=next_masks, + ) + + current_step = dict( + actions=actions, + action_log_probs=action_log_probs, + value_preds=value_preds, + rewards=rewards, + ) + + next_step = {k: v for k, v in next_step.items() if v is not None} + current_step = {k: v for k, v in current_step.items() if v is not None} + + env_slice = slice( + int(buffer_index * self._num_envs / self._nbuffers), + int((buffer_index + 1) * self._num_envs / self._nbuffers), + ) + + if len(next_step) > 0: + self.buffers.set( + (self.current_rollout_step_idxs[buffer_index] + 1, env_slice), + next_step, + strict=False, + ) + + if len(current_step) > 0: + self.buffers.set( + (self.current_rollout_step_idxs[buffer_index], env_slice), + current_step, + strict=False, + ) + + def advance_rollout(self, buffer_index: int = 0): + self.current_rollout_step_idxs[buffer_index] += 1 + + def 
after_update(self): + self.buffers[0] = self.buffers[self.current_rollout_step_idx] + + self.current_rollout_step_idxs = [ + 0 for _ in self.current_rollout_step_idxs + ] + + def compute_returns(self, next_value, use_gae, gamma, tau): + if use_gae: + self.buffers["value_preds"][ + self.current_rollout_step_idx + ] = next_value + gae = 0 + for step in reversed(range(self.current_rollout_step_idx)): + delta = ( + self.buffers["rewards"][step] + + gamma + * self.buffers["value_preds"][step + 1] + * self.buffers["masks"][step + 1] + - self.buffers["value_preds"][step] + ) + gae = ( + delta + gamma * tau * gae * self.buffers["masks"][step + 1] + ) + self.buffers["returns"][step] = ( + gae + self.buffers["value_preds"][step] + ) + else: + self.buffers["returns"][self.current_rollout_step_idx] = next_value + for step in reversed(range(self.current_rollout_step_idx)): + self.buffers["returns"][step] = ( + gamma + * self.buffers["returns"][step + 1] + * self.buffers["masks"][step + 1] + + self.buffers["rewards"][step] + ) + + def recurrent_generator(self, advantages, num_mini_batch) -> TensorDict: + num_environments = advantages.size(1) + assert num_environments >= num_mini_batch, ( + "Trainer requires the number of environments ({}) " + "to be greater than or equal to the number of " + "trainer mini batches ({}).".format( + num_environments, num_mini_batch + ) + ) + if num_environments % num_mini_batch != 0: + warnings.warn( + "Number of environments ({}) is not a multiple of the" + " number of mini batches ({}). This results in mini batches" + " of different sizes, which can harm training performance.".format( + num_environments, num_mini_batch + ) + ) + for inds in torch.randperm(num_environments).chunk(num_mini_batch): + batch = self.buffers[0 : self.current_rollout_step_idx, inds] + batch["advantages"] = advantages[ + 0 : self.current_rollout_step_idx, inds + ] + batch["recurrent_hidden_states"] = batch[ + "recurrent_hidden_states" + ][0:1] + + yield batch.map(lambda v: v.flatten(0, 1)) diff --git a/habitat-lab-dialog/habitat_baselines/common/tensor_dict.py b/habitat-lab-dialog/habitat_baselines/common/tensor_dict.py new file mode 100644 index 0000000..94b3d25 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/common/tensor_dict.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import copy +import numbers +from typing import Callable, Dict, Optional, Tuple, Union, overload + +import numpy as np +import torch + +TensorLike = Union[torch.Tensor, np.ndarray, numbers.Real] +DictTree = Dict[str, Union[TensorLike, "DictTree"]] +TensorIndexType = Union[int, slice, Tuple[Union[int, slice], ...]] + + +class TensorDict(Dict[str, Union["TensorDict", torch.Tensor]]): + r"""A dictionary of tensors that can be indexed like a tensor or like a dictionary. + + .. 
code:: py + t = TensorDict(a=torch.randn(2, 2), b=TensorDict(c=torch.randn(3, 3))) + + print(t) + + print(t[0, 0]) + + print(t["a"]) + + """ + + @classmethod + def from_tree(cls, tree: DictTree) -> "TensorDict": + res = cls() + for k, v in tree.items(): + if isinstance(v, dict): + res[k] = cls.from_tree(v) + else: + res[k] = torch.as_tensor(v) + + return res + + def to_tree(self) -> DictTree: + res: DictTree = dict() + for k, v in self.items(): + if isinstance(v, TensorDict): + res[k] = v.to_tree() + else: + res[k] = v + + return res + + @overload + def __getitem__(self, index: str) -> Union["TensorDict", torch.Tensor]: + ... + + @overload + def __getitem__(self, index: TensorIndexType) -> "TensorDict": + ... + + def __getitem__( + self, index: Union[str, TensorIndexType] + ) -> Union["TensorDict", torch.Tensor]: + if isinstance(index, str): + return super().__getitem__(index) + else: + return TensorDict({k: v[index] for k, v in self.items()}) + + @overload + def set( + self, + index: str, + value: Union[TensorLike, "TensorDict", DictTree], + strict: bool = True, + ) -> None: + ... + + @overload + def set( + self, + index: TensorIndexType, + value: Union["TensorDict", DictTree], + strict: bool = True, + ) -> None: + ... + + def set( + self, + index: Union[str, TensorIndexType], + value: Union[TensorLike, "TensorDict"], + strict: bool = True, + ) -> None: + if isinstance(index, str): + super().__setitem__(index, value) + else: + if strict and (self.keys() != value.keys()): + raise KeyError( + "Keys don't match: Dest={} Source={}".format( + self.keys(), value.keys() + ) + ) + + for k in self.keys(): + if k not in value: + if strict: + raise KeyError(f"Key {k} not in new value dictionary") + else: + continue + + v = value[k] + + if isinstance(v, (TensorDict, dict)): + self[k].set(index, v, strict=strict) + else: + self[k][index].copy_(torch.as_tensor(v)) + + def __setitem__( + self, + index: Union[str, TensorIndexType], + value: Union[torch.Tensor, "TensorDict"], + ): + self.set(index, value) + + @classmethod + def map_func( + cls, + func: Callable[[torch.Tensor], torch.Tensor], + src: "TensorDict", + dst: Optional["TensorDict"] = None, + ) -> "TensorDict": + if dst is None: + dst = TensorDict() + + for k, v in src.items(): + if torch.is_tensor(v): + dst[k] = func(v) + else: + dst[k] = cls.map_func(func, v, dst.get(k, None)) + + return dst + + def map( + self, func: Callable[[torch.Tensor], torch.Tensor] + ) -> "TensorDict": + return self.map_func(func, self) + + def map_in_place( + self, func: Callable[[torch.Tensor], torch.Tensor] + ) -> "TensorDict": + return self.map_func(func, self, self) + + def __deepcopy__(self, _memo=None) -> "TensorDict": + return TensorDict.from_tree(copy.deepcopy(self.to_tree(), memo=_memo)) diff --git a/habitat-lab-dialog/habitat_baselines/common/tensorboard_utils.py b/habitat-lab-dialog/habitat_baselines/common/tensorboard_utils.py new file mode 100644 index 0000000..0958386 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/common/tensorboard_utils.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any + +import numpy as np +import torch +from torch.utils.tensorboard import SummaryWriter + + +class TensorboardWriter: + def __init__(self, log_dir: str, *args: Any, **kwargs: Any): + r"""A Wrapper for tensorboard SummaryWriter. 
It creates a dummy writer + when log_dir is empty string or None. It also has functionality that + generates tb video directly from numpy images. + + Args: + log_dir: Save directory location. Will not write to disk if + log_dir is an empty string. + *args: Additional positional args for SummaryWriter + **kwargs: Additional keyword args for SummaryWriter + """ + self.writer = None + if log_dir is not None and len(log_dir) > 0: + self.writer = SummaryWriter(log_dir, *args, **kwargs) + + def __getattr__(self, item): + if self.writer: + return self.writer.__getattribute__(item) + else: + return lambda *args, **kwargs: None + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.writer: + self.writer.close() + + def add_video_from_np_images( + self, video_name: str, step_idx: int, images: np.ndarray, fps: int = 10 + ) -> None: + r"""Write video into tensorboard from images frames. + + Args: + video_name: name of video string. + step_idx: int of checkpoint index to be displayed. + images: list of n frames. Each frame is a np.ndarray of shape. + fps: frame per second for output video. + + Returns: + None. + """ + if not self.writer: + return + # initial shape of np.ndarray list: N * (H, W, 3) + frame_tensors = [ + torch.from_numpy(np_arr).unsqueeze(0) for np_arr in images + ] + video_tensor = torch.cat(tuple(frame_tensors)) + video_tensor = video_tensor.permute(0, 3, 1, 2).unsqueeze(0) + # final shape of video tensor: (1, n, 3, H, W) + self.writer.add_video( + video_name, video_tensor, fps=fps, global_step=step_idx + ) diff --git a/habitat-lab-dialog/habitat_baselines/config/__init__.py b/habitat-lab-dialog/habitat_baselines/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/habitat-lab-dialog/habitat_baselines/config/default.py b/habitat-lab-dialog/habitat_baselines/config/default.py new file mode 100644 index 0000000..fa3c177 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/config/default.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
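+
+# The experiment config is resolved in layers: the defaults declared below,
+# then any YAML files passed to get_config(), then command-line opts, with
+# later layers overriding earlier ones.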
+
+import warnings
+from typing import List, Optional, Union
+
+import numpy as np
+
+from habitat import get_config as get_task_config
+from habitat.config import Config as CN
+
+DEFAULT_CONFIG_DIR = "configs/"
+CONFIG_FILE_SEPARATOR = ","
+# -----------------------------------------------------------------------------
+# EXPERIMENT CONFIG
+# -----------------------------------------------------------------------------
+_C = CN()
+# task config can be a list of configs like "A.yaml,B.yaml"
+_C.BASE_TASK_CONFIG_PATH = "configs/tasks/pointnav.yaml"
+_C.TASK_CONFIG = CN()  # task_config will be stored as a config node
+_C.CMD_TRAILING_OPTS = []  # store command line options as list of strings
+_C.TRAINER_NAME = "ppo"
+_C.ENV_NAME = "NavRLEnv"
+_C.SIMULATOR_GPU_ID = 0
+_C.TORCH_GPU_ID = 0
+_C.VIDEO_OPTION = ["disk", "tensorboard"]
+_C.TENSORBOARD_DIR = "tb"
+_C.VIDEO_DIR = "video_dir"
+_C.TEST_EPISODE_COUNT = -1
+_C.EVAL_CKPT_PATH_DIR = "data/checkpoints"  # path to ckpt or path to ckpts dir
+_C.NUM_ENVIRONMENTS = 16
+_C.NUM_PROCESSES = -1  # deprecated
+_C.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR"]
+_C.CHECKPOINT_FOLDER = "data/checkpoints"
+_C.NUM_UPDATES = 10000
+_C.NUM_CHECKPOINTS = 10
+# Number of model updates between checkpoints
+_C.CHECKPOINT_INTERVAL = -1
+_C.TOTAL_NUM_STEPS = -1.0
+_C.LOG_INTERVAL = 10
+_C.LOG_FILE = "train.log"
+_C.FORCE_BLIND_POLICY = False
+_C.VERBOSE = True
+# -----------------------------------------------------------------------------
+# EVAL CONFIG
+# -----------------------------------------------------------------------------
+_C.EVAL = CN()
+# The split to evaluate on
+_C.EVAL.SPLIT = "val"
+_C.EVAL.USE_CKPT_CONFIG = True
+# -----------------------------------------------------------------------------
+# REINFORCEMENT LEARNING (RL) ENVIRONMENT CONFIG
+# -----------------------------------------------------------------------------
+_C.RL = CN()
+_C.RL.REWARD_MEASURE = "distance_to_goal"
+_C.RL.SUCCESS_MEASURE = "spl"
+_C.RL.SUCCESS_REWARD = 2.5
+_C.RL.SLACK_REWARD = -0.01
+# -----------------------------------------------------------------------------
+# POLICY CONFIG
+# -----------------------------------------------------------------------------
+_C.RL.POLICY = CN()
+_C.RL.POLICY.name = "PointNavResNetPolicy"
+# -----------------------------------------------------------------------------
+# OBS_TRANSFORMS CONFIG
+# -----------------------------------------------------------------------------
+_C.RL.POLICY.OBS_TRANSFORMS = CN()
+_C.RL.POLICY.OBS_TRANSFORMS.ENABLED_TRANSFORMS = tuple()
+_C.RL.POLICY.OBS_TRANSFORMS.CENTER_CROPPER = CN()
+_C.RL.POLICY.OBS_TRANSFORMS.CENTER_CROPPER.HEIGHT = 256
+_C.RL.POLICY.OBS_TRANSFORMS.CENTER_CROPPER.WIDTH = 256
+_C.RL.POLICY.OBS_TRANSFORMS.RESIZE_SHORTEST_EDGE = CN()
+_C.RL.POLICY.OBS_TRANSFORMS.RESIZE_SHORTEST_EDGE.SIZE = 256
+_C.RL.POLICY.OBS_TRANSFORMS.CUBE2EQ = CN()
+_C.RL.POLICY.OBS_TRANSFORMS.CUBE2EQ.HEIGHT = 256
+_C.RL.POLICY.OBS_TRANSFORMS.CUBE2EQ.WIDTH = 512
+_C.RL.POLICY.OBS_TRANSFORMS.CUBE2EQ.SENSOR_UUIDS = list()
+_C.RL.POLICY.OBS_TRANSFORMS.CUBE2FISH = CN()
+_C.RL.POLICY.OBS_TRANSFORMS.CUBE2FISH.HEIGHT = 256
+_C.RL.POLICY.OBS_TRANSFORMS.CUBE2FISH.WIDTH = 256
+_C.RL.POLICY.OBS_TRANSFORMS.CUBE2FISH.FOV = 180
+_C.RL.POLICY.OBS_TRANSFORMS.CUBE2FISH.PARAMS = (0.2, 0.2, 0.2)
+_C.RL.POLICY.OBS_TRANSFORMS.CUBE2FISH.SENSOR_UUIDS = list()
+_C.RL.POLICY.OBS_TRANSFORMS.EQ2CUBE = CN()
+_C.RL.POLICY.OBS_TRANSFORMS.EQ2CUBE.HEIGHT = 256
+_C.RL.POLICY.OBS_TRANSFORMS.EQ2CUBE.WIDTH = 256
+_C.RL.POLICY.OBS_TRANSFORMS.EQ2CUBE.SENSOR_UUIDS = list()
+# -----------------------------------------------------------------------------
+# PROXIMAL POLICY OPTIMIZATION (PPO)
+# -----------------------------------------------------------------------------
+_C.RL.PPO = CN()
+_C.RL.PPO.clip_param = 0.2
+_C.RL.PPO.ppo_epoch = 4
+_C.RL.PPO.num_mini_batch = 2
+_C.RL.PPO.value_loss_coef = 0.5
+_C.RL.PPO.entropy_coef = 0.01
+_C.RL.PPO.lr = 2.5e-4
+_C.RL.PPO.eps = 1e-5
+_C.RL.PPO.max_grad_norm = 0.5
+_C.RL.PPO.num_steps = 5
+_C.RL.PPO.use_gae = True
+_C.RL.PPO.use_linear_lr_decay = False
+_C.RL.PPO.use_linear_clip_decay = False
+_C.RL.PPO.gamma = 0.99
+_C.RL.PPO.tau = 0.95
+_C.RL.PPO.reward_window_size = 50
+_C.RL.PPO.use_normalized_advantage = False
+_C.RL.PPO.hidden_size = 512
+# Use double buffered sampling, which typically helps
+# when environment time is similar to or larger than
+# policy inference time during rollout generation.
+# Note that this does not change the memory requirements.
+_C.RL.PPO.use_double_buffered_sampler = False
+# -----------------------------------------------------------------------------
+# DECENTRALIZED DISTRIBUTED PROXIMAL POLICY OPTIMIZATION (DD-PPO)
+# -----------------------------------------------------------------------------
+_C.RL.DDPPO = CN()
+_C.RL.DDPPO.sync_frac = 0.6
+_C.RL.DDPPO.distrib_backend = "GLOO"
+_C.RL.DDPPO.rnn_type = "GRU"
+_C.RL.DDPPO.num_recurrent_layers = 1
+_C.RL.DDPPO.backbone = "resnet18"
+_C.RL.DDPPO.pretrained_weights = "data/ddppo-models/gibson-2plus-resnet50.pth"
+# Loads pretrained weights
+_C.RL.DDPPO.pretrained = False
+# Loads just the visual encoder backbone weights
+_C.RL.DDPPO.pretrained_encoder = False
+# Whether or not the visual encoder backbone will be trained
+_C.RL.DDPPO.train_encoder = True
+# Whether or not to reset the critic linear layer
+_C.RL.DDPPO.reset_critic = True
+# Forces distributed mode for testing
+_C.RL.DDPPO.force_distributed = False
+# -----------------------------------------------------------------------------
+# ORBSLAM2 BASELINE
+# -----------------------------------------------------------------------------
+_C.ORBSLAM2 = CN()
+_C.ORBSLAM2.SLAM_VOCAB_PATH = "habitat_baselines/slambased/data/ORBvoc.txt"
+_C.ORBSLAM2.SLAM_SETTINGS_PATH = (
+    "habitat_baselines/slambased/data/mp3d3_small1k.yaml"
+)
+_C.ORBSLAM2.MAP_CELL_SIZE = 0.1
+_C.ORBSLAM2.MAP_SIZE = 40
+_C.ORBSLAM2.CAMERA_HEIGHT = get_task_config().SIMULATOR.DEPTH_SENSOR.POSITION[
+    1
+]
+_C.ORBSLAM2.BETA = 100
+_C.ORBSLAM2.H_OBSTACLE_MIN = 0.3 * _C.ORBSLAM2.CAMERA_HEIGHT
+_C.ORBSLAM2.H_OBSTACLE_MAX = 1.0 * _C.ORBSLAM2.CAMERA_HEIGHT
+_C.ORBSLAM2.D_OBSTACLE_MIN = 0.1
+_C.ORBSLAM2.D_OBSTACLE_MAX = 4.0
+_C.ORBSLAM2.PREPROCESS_MAP = True
+_C.ORBSLAM2.MIN_PTS_IN_OBSTACLE = (
+    get_task_config().SIMULATOR.DEPTH_SENSOR.WIDTH / 2.0
+)
+_C.ORBSLAM2.ANGLE_TH = float(np.deg2rad(15))
+_C.ORBSLAM2.DIST_REACHED_TH = 0.15
+_C.ORBSLAM2.NEXT_WAYPOINT_TH = 0.5
+_C.ORBSLAM2.NUM_ACTIONS = 3
+_C.ORBSLAM2.DIST_TO_STOP = 0.05
+_C.ORBSLAM2.PLANNER_MAX_STEPS = 500
+_C.ORBSLAM2.DEPTH_DENORM = get_task_config().SIMULATOR.DEPTH_SENSOR.MAX_DEPTH
+# -----------------------------------------------------------------------------
+# PROFILING
+# -----------------------------------------------------------------------------
+_C.PROFILING = CN()
+_C.PROFILING.CAPTURE_START_STEP = -1
+_C.PROFILING.NUM_STEPS_TO_CAPTURE = -1
+
+
+_C.register_renamed_key
+
+
+def get_config(
+    config_paths: Optional[Union[List[str], str]] = None,
+    opts: Optional[list] = None,
+) -> CN:
+    r"""Create a unified config with default values overwritten by values from
+    :ref:`config_paths` and overwritten by options from :ref:`opts`.
+
+    Args:
+        config_paths: List of config paths or string that contains comma
+            separated list of config paths.
+        opts: Config options (keys, values) in a list (e.g., passed from
+            command line into the config). For example,
+            ``opts = ['FOO.BAR', 0.5]``. Argument can be used for parameter
+            sweeping or quick tests.
+    """
+    config = _C.clone()
+    if config_paths:
+        if isinstance(config_paths, str):
+            if CONFIG_FILE_SEPARATOR in config_paths:
+                config_paths = config_paths.split(CONFIG_FILE_SEPARATOR)
+            else:
+                config_paths = [config_paths]
+
+        for config_path in config_paths:
+            config.merge_from_file(config_path)
+
+    if opts:
+        for k, v in zip(opts[0::2], opts[1::2]):
+            if k == "BASE_TASK_CONFIG_PATH":
+                config.BASE_TASK_CONFIG_PATH = v
+
+    config.TASK_CONFIG = get_task_config(config.BASE_TASK_CONFIG_PATH)
+    if opts:
+        config.CMD_TRAILING_OPTS = config.CMD_TRAILING_OPTS + opts
+        config.merge_from_list(config.CMD_TRAILING_OPTS)
+
+    if config.NUM_PROCESSES != -1:
+        warnings.warn(
+            "NUM_PROCESSES is deprecated and will be removed in a future version."
+            " Use NUM_ENVIRONMENTS instead."
+            " Overwriting NUM_ENVIRONMENTS with NUM_PROCESSES for backwards compatibility."
+        )
+
+        config.NUM_ENVIRONMENTS = config.NUM_PROCESSES
+
+    config.freeze()
+    return config
diff --git a/habitat-lab-dialog/habitat_baselines/config/eqa/il_eqa_cnn_pretrain.yaml b/habitat-lab-dialog/habitat_baselines/config/eqa/il_eqa_cnn_pretrain.yaml
new file mode 100644
index 0000000..ea34ec2
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/config/eqa/il_eqa_cnn_pretrain.yaml
@@ -0,0 +1,27 @@
+BASE_TASK_CONFIG_PATH: "configs/tasks/eqa_mp3d.yaml"
+TRAINER_NAME: "eqa-cnn-pretrain"
+
+ENV_NAME: "EQACNNPretrainILEnv"
+SIMULATOR_GPU_ID: 0
+TORCH_GPU_ID: 0
+
+DATASET_PATH: "data/datasets/eqa/eqa_cnn_pretrain/{split}/{split}.db"
+
+EVAL_CKPT_PATH_DIR: "data/eqa/eqa_cnn_pretrain/checkpoints/epoch_5.ckpt"
+
+CHECKPOINT_FOLDER: "data/eqa/eqa_cnn_pretrain/checkpoints"
+TENSORBOARD_DIR: "data/eqa/eqa_cnn_pretrain/tb"
+RESULTS_DIR: "data/eqa/eqa_cnn_pretrain/results/{split}/{type}"
+OUTPUT_LOG_DIR: data/eqa/eqa_cnn_pretrain/logs
+
+LOG_METRICS: True
+LOG_INTERVAL: 50
+EVAL_SAVE_RESULTS: True
+EVAL_SAVE_RESULTS_INTERVAL: 50
+
+IL:
+  EQACNNPretrain:
+    # params
+    max_epochs: 5
+    batch_size: 20
+    lr: 1e-3
diff --git a/habitat-lab-dialog/habitat_baselines/config/eqa/il_pacman_nav.yaml b/habitat-lab-dialog/habitat_baselines/config/eqa/il_pacman_nav.yaml
new file mode 100644
index 0000000..684d584
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/config/eqa/il_pacman_nav.yaml
@@ -0,0 +1,34 @@
+BASE_TASK_CONFIG_PATH: "configs/tasks/eqa_mp3d.yaml"
+TRAINER_NAME: "pacman"
+
+ENV_NAME: "NavILEnv"
+SIMULATOR_GPU_ID: 0
+TORCH_GPU_ID: 0
+
+ONLY_VQA_TASK: False # if True, only last `num_frames` will be saved to disk.
+#if False, all frames for each episode are saved to disk (for NAV task later) +FRAME_DATASET_PATH: "data/datasets/eqa/frame_dataset/{split}" +EVAL_CKPT_PATH_DIR: "data/eqa/nav/checkpoints/" +EQA_CNN_PRETRAIN_CKPT_PATH: "data/eqa/eqa_cnn_pretrain/checkpoints/epoch_5.ckpt" + +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR", "DEPTH_SENSOR"] +CHECKPOINT_FOLDER: "data/eqa/nav/checkpoints" +TENSORBOARD_DIR: "data/eqa/nav/tb" +RESULTS_DIR: "data/eqa/nav/results/{split}" + +LOG_METRICS: True +OUTPUT_LOG_DIR: data/eqa/nav/logs +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 1 +EVAL_SAVE_RESULTS: True +EVAL_SAVE_RESULTS_INTERVAL: 10 + +IL: + NAV: + # nav params + max_epochs: 20 + batch_size: 20 + lr: 1e-3 + max_controller_actions: 5 + max_episode_length: 100 diff --git a/habitat-lab-dialog/habitat_baselines/config/eqa/il_vqa.yaml b/habitat-lab-dialog/habitat_baselines/config/eqa/il_vqa.yaml new file mode 100644 index 0000000..c0f9f31 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/config/eqa/il_vqa.yaml @@ -0,0 +1,33 @@ +BASE_TASK_CONFIG_PATH: "configs/tasks/eqa_mp3d.yaml" +TRAINER_NAME: "vqa" + +ENV_NAME: "VQAILEnv" +SIMULATOR_GPU_ID: 0 +TORCH_GPU_ID: 0 + +ONLY_VQA_TASK: False # if True, only last `num_frames` will be saved to disk. +#if False, all frames for each episode are saved to disk (for NAV task later) +DATASET_PATH: "data/datasets/eqa/frame_dataset/{split}/{split}.db" +FRAME_DATASET_PATH: "data/datasets/eqa/frame_dataset/{split}" +EVAL_CKPT_PATH_DIR: "data/eqa/vqa/checkpoints/" +EQA_CNN_PRETRAIN_CKPT_PATH: "data/eqa/eqa_cnn_pretrain/checkpoints/epoch_5.ckpt" + +SENSORS: ["RGB_SENSOR"] +CHECKPOINT_FOLDER: "data/eqa/vqa/checkpoints/" +TENSORBOARD_DIR: "data/eqa/vqa/tb" +RESULTS_DIR: "data/eqa/vqa/results/{split}" + +LOG_METRICS: True +OUTPUT_LOG_DIR: "data/eqa/vqa/logs" +LOG_INTERVAL: 100 +EVAL_SAVE_RESULTS: True +EVAL_SAVE_RESULTS_INTERVAL: 10 + +IL: + VQA: + # vqa params + num_frames: 5 + max_epochs: 50 + batch_size: 20 + lr: 3e-4 + freeze_encoder: False diff --git a/habitat-lab-dialog/habitat_baselines/config/imagenav/ddppo_imagenav_example.yaml b/habitat-lab-dialog/habitat_baselines/config/imagenav/ddppo_imagenav_example.yaml new file mode 100644 index 0000000..0d7993a --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/config/imagenav/ddppo_imagenav_example.yaml @@ -0,0 +1,66 @@ +BASE_TASK_CONFIG_PATH: "configs/tasks/imagenav.yaml" +TRAINER_NAME: "ddppo" +ENV_NAME: "NavRLEnv" +SIMULATOR_GPU_ID: 0 +TORCH_GPU_ID: 0 +VIDEO_OPTION: [] +TENSORBOARD_DIR: "tb" +VIDEO_DIR: "video_dir" +TEST_EPISODE_COUNT: -1 +EVAL_CKPT_PATH_DIR: "data/new_checkpoints" +NUM_ENVIRONMENTS: 4 +SENSORS: ["RGB_SENSOR", "DEPTH_SENSOR"] +CHECKPOINT_FOLDER: "data/new_checkpoints" +NUM_UPDATES: -1 +TOTAL_NUM_STEPS: 1e6 +LOG_INTERVAL: 10 +NUM_CHECKPOINTS: 10 + + +RL: + SUCCESS_REWARD: 2.5 + SLACK_REWARD: -1e-4 + + POLICY: + name: "PointNavResNetPolicy" + + PPO: + # ppo params + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.01 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + num_steps: 64 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + reward_window_size: 50 + + use_normalized_advantage: False + + hidden_size: 512 + DDPPO: + sync_frac: 0.6 + # The PyTorch distributed backend to use + distrib_backend: GLOO + # Visual encoder backbone + pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth + # Initialize with pretrained weights + pretrained: False + # Initialize just the visual encoder backbone with pretrained 
weights
+    pretrained_encoder: False
+    # Whether or not the visual encoder backbone will be trained.
+    train_encoder: True
+    # Whether or not to reset the critic linear layer
+    reset_critic: True
+
+    # Model parameters
+    backbone: resnet50
+    rnn_type: LSTM
+    num_recurrent_layers: 2
diff --git a/habitat-lab-dialog/habitat_baselines/config/imagenav/ddppo_imagenav_gibson.yaml b/habitat-lab-dialog/habitat_baselines/config/imagenav/ddppo_imagenav_gibson.yaml
new file mode 100644
index 0000000..acde8f2
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/config/imagenav/ddppo_imagenav_gibson.yaml
@@ -0,0 +1,71 @@
+VERBOSE: False
+BASE_TASK_CONFIG_PATH: "configs/tasks/imagenav_gibson.yaml"
+TRAINER_NAME: "ddppo"
+ENV_NAME: "NavRLEnv"
+SIMULATOR_GPU_ID: 0
+TORCH_GPU_ID: 0
+VIDEO_OPTION: []
+TENSORBOARD_DIR: "tb"
+VIDEO_DIR: "video_dir"
+TEST_EPISODE_COUNT: -1
+EVAL_CKPT_PATH_DIR: "data/new_checkpoints"
+NUM_ENVIRONMENTS: 4
+SENSORS: ["RGB_SENSOR", "DEPTH_SENSOR"]
+CHECKPOINT_FOLDER: "data/new_checkpoints"
+NUM_UPDATES: -1
+TOTAL_NUM_STEPS: 1e9
+LOG_INTERVAL: 100
+NUM_CHECKPOINTS: 100
+
+RL:
+  SUCCESS_REWARD: 2.5
+  SLACK_REWARD: -1e-4
+
+  POLICY:
+    name: "PointNavResNetPolicy"
+
+  PPO:
+    # ppo params
+    clip_param: 0.2
+    ppo_epoch: 2
+    num_mini_batch: 2
+    value_loss_coef: 0.5
+    entropy_coef: 0.01
+    lr: 2.5e-4
+    eps: 1e-5
+    max_grad_norm: 0.2
+    num_steps: 64
+    use_gae: True
+    gamma: 0.99
+    tau: 0.95
+    use_linear_clip_decay: False
+    use_linear_lr_decay: False
+    reward_window_size: 50
+
+    use_normalized_advantage: False
+
+    hidden_size: 512
+
+    # Use double buffered sampling, which typically helps
+    # when environment time is similar to or larger than
+    # policy inference time during rollout generation
+    use_double_buffered_sampler: False
+  DDPPO:
+    sync_frac: 0.6
+    # The PyTorch distributed backend to use
+    distrib_backend: NCCL
+    # Visual encoder backbone
+    pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth
+    # Initialize with pretrained weights
+    pretrained: False
+    # Initialize just the visual encoder backbone with pretrained weights
+    pretrained_encoder: False
+    # Whether or not the visual encoder backbone will be trained.
+ train_encoder: True + # Whether or not to reset the critic linear layer + reset_critic: True + + # Model parameters + backbone: resnet50 + rnn_type: LSTM + num_recurrent_layers: 2 diff --git a/habitat-lab-dialog/habitat_baselines/config/imagenav/ppo_imagenav_example.yaml b/habitat-lab-dialog/habitat_baselines/config/imagenav/ppo_imagenav_example.yaml new file mode 100644 index 0000000..5007057 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/config/imagenav/ppo_imagenav_example.yaml @@ -0,0 +1,37 @@ +BASE_TASK_CONFIG_PATH: "configs/tasks/imagenav.yaml" +TRAINER_NAME: "ppo" +ENV_NAME: "NavRLEnv" +SIMULATOR_GPU_ID: 0 +TORCH_GPU_ID: 0 +VIDEO_OPTION: ["disk", "tensorboard"] +TENSORBOARD_DIR: "tb" +VIDEO_DIR: "video_dir" +TEST_EPISODE_COUNT: 2 +EVAL_CKPT_PATH_DIR: "data/new_checkpoints" +NUM_ENVIRONMENTS: 1 +SENSORS: ["RGB_SENSOR", "DEPTH_SENSOR"] +CHECKPOINT_FOLDER: "data/new_checkpoints" +NUM_UPDATES: -1 +TOTAL_NUM_STEPS: 1e6 +LOG_INTERVAL: 10 +NUM_CHECKPOINTS: 10 + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.01 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + num_steps: 128 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + reward_window_size: 50 diff --git a/habitat-lab-dialog/habitat_baselines/config/objectnav/ddppo_objectnav.yaml b/habitat-lab-dialog/habitat_baselines/config/objectnav/ddppo_objectnav.yaml new file mode 100644 index 0000000..273654a --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/config/objectnav/ddppo_objectnav.yaml @@ -0,0 +1,69 @@ +BASE_TASK_CONFIG_PATH: "configs/tasks/objectnav_mp3d.yaml" +CMD_TRAILING_OPTS: ["TASK_CONFIG.ENVIRONMENT.ITERATOR_OPTIONS.MAX_SCENE_REPEAT_STEPS", "50000"] +ENV_NAME: "NavRLEnv" +SIMULATOR_GPU_ID: 0 +TORCH_GPU_ID: 0 +VIDEO_OPTION: [] +TENSORBOARD_DIR: "tb" +VIDEO_DIR: "video_dir" +TEST_EPISODE_COUNT: -1 +EVAL_CKPT_PATH_DIR: "data/new_checkpoints" +NUM_ENVIRONMENTS: 4 +CHECKPOINT_FOLDER: "data/new_checkpoints" +TRAINER_NAME: "ddppo" +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 270000 +LOG_INTERVAL: 10 +NUM_CHECKPOINTS: 100 + +EVAL: + SPLIT: "val" + +RL: + SUCCESS_REWARD: 2.5 + SLACK_REWARD: -1e-3 + + POLICY: + name: "PointNavResNetPolicy" + + PPO: + # ppo params + clip_param: 0.2 + ppo_epoch: 4 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.01 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + num_steps: 64 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + reward_window_size: 50 + + use_normalized_advantage: False + + hidden_size: 512 + + DDPPO: + sync_frac: 0.6 + # The PyTorch distributed backend to use + distrib_backend: NCCL + # Visual encoder backbone + pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth + # Initialize with pretrained weights + pretrained: False + # Initialize just the visual encoder backbone with pretrained weights + pretrained_encoder: False + # Whether or not the visual encoder backbone will be trained. 
+    train_encoder: True
+    # Whether or not to reset the critic linear layer
+    reset_critic: True
+
+    # Model parameters
+    backbone: resnet50
+    rnn_type: LSTM
+    num_recurrent_layers: 2
diff --git a/habitat-lab-dialog/habitat_baselines/config/pointnav/ddppo_pointnav.yaml b/habitat-lab-dialog/habitat_baselines/config/pointnav/ddppo_pointnav.yaml
new file mode 100644
index 0000000..2189a9d
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/config/pointnav/ddppo_pointnav.yaml
@@ -0,0 +1,71 @@
+VERBOSE: False
+BASE_TASK_CONFIG_PATH: "configs/tasks/pointnav_gibson.yaml"
+TRAINER_NAME: "ddppo"
+ENV_NAME: "NavRLEnv"
+SIMULATOR_GPU_ID: 0
+TORCH_GPU_ID: 0
+VIDEO_OPTION: []
+TENSORBOARD_DIR: "tb"
+VIDEO_DIR: "video_dir"
+TEST_EPISODE_COUNT: -1
+EVAL_CKPT_PATH_DIR: "data/new_checkpoints"
+NUM_ENVIRONMENTS: 4
+SENSORS: ["DEPTH_SENSOR"]
+CHECKPOINT_FOLDER: "data/new_checkpoints"
+NUM_UPDATES: -1
+TOTAL_NUM_STEPS: 2.5e9
+LOG_INTERVAL: 10
+NUM_CHECKPOINTS: 100
+
+RL:
+  SUCCESS_REWARD: 2.5
+
+  POLICY:
+    name: "PointNavResNetPolicy"
+
+  PPO:
+    # ppo params
+    clip_param: 0.2
+    ppo_epoch: 2
+    num_mini_batch: 2
+    value_loss_coef: 0.5
+    entropy_coef: 0.01
+    lr: 2.5e-4
+    eps: 1e-5
+    max_grad_norm: 0.2
+    num_steps: 128
+    use_gae: True
+    gamma: 0.99
+    tau: 0.95
+    use_linear_clip_decay: False
+    use_linear_lr_decay: False
+    reward_window_size: 50
+
+    use_normalized_advantage: False
+
+    hidden_size: 512
+
+    # Use double buffered sampling, which typically helps
+    # when environment time is similar to or larger than
+    # policy inference time during rollout generation
+    use_double_buffered_sampler: False
+
+  DDPPO:
+    sync_frac: 0.6
+    # The PyTorch distributed backend to use
+    distrib_backend: NCCL
+    # Visual encoder backbone
+    pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth
+    # Initialize with pretrained weights
+    pretrained: False
+    # Initialize just the visual encoder backbone with pretrained weights
+    pretrained_encoder: False
+    # Whether or not the visual encoder backbone will be trained.
+    train_encoder: True
+    # Whether or not to reset the critic linear layer
+    reset_critic: True
+
+    # Model parameters
+    backbone: resnet50
+    rnn_type: LSTM
+    num_recurrent_layers: 2
diff --git a/habitat-lab-dialog/habitat_baselines/config/pointnav/ppo_pointnav.yaml b/habitat-lab-dialog/habitat_baselines/config/pointnav/ppo_pointnav.yaml
new file mode 100644
index 0000000..efd3043
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/config/pointnav/ppo_pointnav.yaml
@@ -0,0 +1,48 @@
+# Hyperparameters and ResNet18 based on https://arxiv.org/abs/2012.0611
+
+VERBOSE: False
+
+BASE_TASK_CONFIG_PATH: "configs/tasks/pointnav_gibson.yaml"
+TRAINER_NAME: "ppo"
+ENV_NAME: "NavRLEnv"
+SIMULATOR_GPU_ID: 0
+TORCH_GPU_ID: 0
+VIDEO_OPTION: []
+# Can be uncommented to generate videos.
+# VIDEO_OPTION: ["disk", "tensorboard"]
+TENSORBOARD_DIR: "tb"
+VIDEO_DIR: "video_dir"
+# Evaluate on all episodes
+TEST_EPISODE_COUNT: -1
+EVAL_CKPT_PATH_DIR: "data/new_checkpoints"
+NUM_ENVIRONMENTS: 6
+SENSORS: ["DEPTH_SENSOR"]
+CHECKPOINT_FOLDER: "data/new_checkpoints"
+TOTAL_NUM_STEPS: 75e6
+LOG_INTERVAL: 25
+NUM_CHECKPOINTS: 100
+
+RL:
+  PPO:
+    # ppo params
+    clip_param: 0.2
+    ppo_epoch: 4
+    num_mini_batch: 2
+    value_loss_coef: 0.5
+    entropy_coef: 0.01
+    lr: 2.5e-4
+    eps: 1e-5
+    max_grad_norm: 0.5
+    num_steps: 128
+    hidden_size: 512
+    use_gae: True
+    gamma: 0.99
+    tau: 0.95
+    use_linear_clip_decay: True
+    use_linear_lr_decay: True
+    reward_window_size: 50
+
+    # Use double buffered sampling, which typically helps
+    # when environment time is similar to or larger than
+    # policy inference time during rollout generation
+    use_double_buffered_sampler: False
diff --git a/habitat-lab-dialog/habitat_baselines/config/pointnav/ppo_pointnav_example.yaml b/habitat-lab-dialog/habitat_baselines/config/pointnav/ppo_pointnav_example.yaml
new file mode 100644
index 0000000..b437ea8
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/config/pointnav/ppo_pointnav_example.yaml
@@ -0,0 +1,41 @@
+# Note: This is an example config, see habitat_baselines/config/pointnav/ppo_pointnav.yaml
+# for better hyperparameters for actual training
+
+BASE_TASK_CONFIG_PATH: "configs/tasks/pointnav.yaml"
+TRAINER_NAME: "ppo"
+ENV_NAME: "NavRLEnv"
+SIMULATOR_GPU_ID: 0
+TORCH_GPU_ID: 0
+VIDEO_OPTION: ["disk", "tensorboard"]
+TENSORBOARD_DIR: "tb"
+VIDEO_DIR: "video_dir"
+# To evaluate on all episodes, set this to -1
+TEST_EPISODE_COUNT: 2
+EVAL_CKPT_PATH_DIR: "data/new_checkpoints"
+NUM_ENVIRONMENTS: 1
+SENSORS: ["RGB_SENSOR", "DEPTH_SENSOR"]
+CHECKPOINT_FOLDER: "data/new_checkpoints"
+NUM_UPDATES: -1
+TOTAL_NUM_STEPS: 1e6
+LOG_INTERVAL: 10
+NUM_CHECKPOINTS: 50
+
+RL:
+  PPO:
+    # ppo params
+    clip_param: 0.1
+    ppo_epoch: 1
+    num_mini_batch: 1
+    value_loss_coef: 0.5
+    entropy_coef: 0.01
+    lr: 2.5e-4
+    eps: 1e-5
+    max_grad_norm: 0.5
+    num_steps: 32
+    hidden_size: 512
+    use_gae: True
+    gamma: 0.99
+    tau: 0.95
+    use_linear_clip_decay: True
+    use_linear_lr_decay: True
+    reward_window_size: 50
diff --git a/habitat-lab-dialog/habitat_baselines/config/pointnav/ppo_pointnav_habitat_iccv19.yaml b/habitat-lab-dialog/habitat_baselines/config/pointnav/ppo_pointnav_habitat_iccv19.yaml
new file mode 100644
index 0000000..a43a806
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/config/pointnav/ppo_pointnav_habitat_iccv19.yaml
@@ -0,0 +1,53 @@
+# Note this config here is only for reproducibility's sake.
+# ppo_pointnav.yaml contains the known best practices
+# and should be used as the starting point instead.
+
+VERBOSE: False
+
+BASE_TASK_CONFIG_PATH: "configs/tasks/pointnav_gibson.yaml"
+TRAINER_NAME: "ppo"
+ENV_NAME: "NavRLEnv"
+SIMULATOR_GPU_ID: 0
+TORCH_GPU_ID: 0
+VIDEO_OPTION: []
+# Can be uncommented to generate videos.
+# VIDEO_OPTION: ["disk", "tensorboard"] +TENSORBOARD_DIR: "tb" +VIDEO_DIR: "video_dir" +# Evaluate on all episodes +TEST_EPISODE_COUNT: -1 +EVAL_CKPT_PATH_DIR: "data/new_checkpoints" +# This was 6 for mp3d and 8 for gibson in the paper +NUM_ENVIRONMENTS: 6 +SENSORS: ["DEPTH_SENSOR"] +CHECKPOINT_FOLDER: "data/new_checkpoints" +TOTAL_NUM_STEPS: 75e6 +LOG_INTERVAL: 25 +NUM_CHECKPOINTS: 100 + +RL: + SUCCESS_REWARD: 10.0 + + POLICY: + name: "PointNavBaselinePolicy" + + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 4 + value_loss_coef: 0.5 + entropy_coef: 0.01 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + num_steps: 128 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + reward_window_size: 50 + + use_normalized_advantage: True diff --git a/habitat-lab-dialog/habitat_baselines/config/test/ddppo_imagenav_test.yaml b/habitat-lab-dialog/habitat_baselines/config/test/ddppo_imagenav_test.yaml new file mode 100644 index 0000000..bc322a6 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/config/test/ddppo_imagenav_test.yaml @@ -0,0 +1,67 @@ +BASE_TASK_CONFIG_PATH: "configs/tasks/imagenav.yaml" +TRAINER_NAME: "ddppo" +ENV_NAME: "NavRLEnv" +SIMULATOR_GPU_ID: 0 +TORCH_GPU_ID: 0 +VIDEO_OPTION: [] +TENSORBOARD_DIR: "" +VIDEO_DIR: "" +SENSORS: ["RGB_SENSOR", "DEPTH_SENSOR"] +EVAL_CKPT_PATH_DIR: "data/test_checkpoints/ddppo/imagenav/ckpt.0.pth" +NUM_ENVIRONMENTS: 1 +CHECKPOINT_FOLDER: "data/test_checkpoints/ddppo/imagenav/" +NUM_UPDATES: 2 +LOG_INTERVAL: 100 +NUM_CHECKPOINTS: 2 +TEST_EPISODE_COUNT: 2 + + +RL: + SUCCESS_REWARD: 2.5 + + POLICY: + name: "PointNavResNetPolicy" + + PPO: + # ppo params + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.01 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + num_steps: 16 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + reward_window_size: 50 + + use_normalized_advantage: False + + hidden_size: 512 + + DDPPO: + sync_frac: 0.6 + # The PyTorch distributed backend to use + distrib_backend: GLOO + # Visual encoder backbone + pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth + # Initialize with pretrained weights + pretrained: False + # Initialize just the visual encoder backbone with pretrained weights + pretrained_encoder: False + # Whether or not the visual encoder backbone will be trained. 
+ train_encoder: True + # Whether or not to reset the critic linear layer + reset_critic: True + + # Model parameters + backbone: resnet18 + rnn_type: LSTM + num_recurrent_layers: 2 + + force_distributed: True diff --git a/habitat-lab-dialog/habitat_baselines/config/test/ddppo_pointnav_test.yaml b/habitat-lab-dialog/habitat_baselines/config/test/ddppo_pointnav_test.yaml new file mode 100644 index 0000000..4e6f37b --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/config/test/ddppo_pointnav_test.yaml @@ -0,0 +1,64 @@ +BASE_TASK_CONFIG_PATH: "configs/tasks/pointnav.yaml" +TRAINER_NAME: "ddppo" +ENV_NAME: "NavRLEnv" +SIMULATOR_GPU_ID: 0 +TORCH_GPU_ID: 0 +VIDEO_OPTION: [] +TENSORBOARD_DIR: "" +EVAL_CKPT_PATH_DIR: "data/test_checkpoints/ddppo/pointnav/ckpt.0.pth" +NUM_ENVIRONMENTS: 1 +CHECKPOINT_FOLDER: "data/test_checkpoints/ddppo/pointnav/" +NUM_UPDATES: 2 +NUM_CHECKPOINTS: 2 +LOG_INTERVAL: 100 +TEST_EPISODE_COUNT: 2 + +RL: + SUCCESS_REWARD: 2.5 + + POLICY: + name: "PointNavResNetPolicy" + + PPO: + # ppo params + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.01 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + num_steps: 16 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + reward_window_size: 50 + + use_normalized_advantage: False + + hidden_size: 512 + + DDPPO: + sync_frac: 0.6 + # The PyTorch distributed backend to use + distrib_backend: GLOO + # Visual encoder backbone + pretrained_weights: data/ddppo-models/gibson-2plus-resnet50.pth + # Initialize with pretrained weights + pretrained: False + # Initialize just the visual encoder backbone with pretrained weights + pretrained_encoder: False + # Whether or not the visual encoder backbone will be trained. 
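+    # (set to False to keep the backbone weights fixed during training)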
+ train_encoder: True + # Whether or not to reset the critic linear layer + reset_critic: True + + # Model parameters + backbone: resnet18 + rnn_type: LSTM + num_recurrent_layers: 2 + + force_distributed: True diff --git a/habitat-lab-dialog/habitat_baselines/config/test/ppo_imagenav_test.yaml b/habitat-lab-dialog/habitat_baselines/config/test/ppo_imagenav_test.yaml new file mode 100644 index 0000000..8bd608b --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/config/test/ppo_imagenav_test.yaml @@ -0,0 +1,34 @@ +BASE_TASK_CONFIG_PATH: "configs/tasks/imagenav.yaml" +TRAINER_NAME: "ppo" +ENV_NAME: "NavRLEnv" +SIMULATOR_GPU_ID: 0 +TORCH_GPU_ID: 0 +VIDEO_OPTION: [] +TENSORBOARD_DIR: "" +EVAL_CKPT_PATH_DIR: "data/test_checkpoints/ppo/imagenav/ckpt.0.pth" +NUM_ENVIRONMENTS: 1 +CHECKPOINT_FOLDER: "data/test_checkpoints/ppo/imagenav/" +NUM_UPDATES: 2 +LOG_INTERVAL: 100 +NUM_CHECKPOINTS: 2 +TEST_EPISODE_COUNT: 2 + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.01 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + num_steps: 16 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + reward_window_size: 50 diff --git a/habitat-lab-dialog/habitat_baselines/config/test/ppo_pointnav_test.yaml b/habitat-lab-dialog/habitat_baselines/config/test/ppo_pointnav_test.yaml new file mode 100644 index 0000000..796dda6 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/config/test/ppo_pointnav_test.yaml @@ -0,0 +1,35 @@ +BASE_TASK_CONFIG_PATH: "configs/tasks/pointnav.yaml" +TRAINER_NAME: "ppo" +ENV_NAME: "NavRLEnv" +SIMULATOR_GPU_ID: 0 +TORCH_GPU_ID: 0 +VIDEO_OPTION: ["DISK"] +VIDEO_DIR: "data/test_checkpoints/ppo/pointnav/video" +TENSORBOARD_DIR: "" +EVAL_CKPT_PATH_DIR: "data/test_checkpoints/ppo/pointnav/ckpt.0.pth" +NUM_ENVIRONMENTS: 1 +CHECKPOINT_FOLDER: "data/test_checkpoints/ppo/pointnav/" +NUM_UPDATES: 2 +LOG_INTERVAL: 100 +NUM_CHECKPOINTS: 2 +TEST_EPISODE_COUNT: 2 + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.01 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + num_steps: 16 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + reward_window_size: 50 diff --git a/habitat-lab-dialog/habitat_baselines/il/README.md b/habitat-lab-dialog/habitat_baselines/il/README.md new file mode 100644 index 0000000..9523736 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/il/README.md @@ -0,0 +1,141 @@ +Imitation Learning (IL) +======================= + +## Embodied Question Answering + +**based on EmbodiedQA (Das et al. CVPR 2018) implementation.** + +**Paper:** https://embodiedqa.org/paper.pdf + +**Code:** https://github.com/facebookresearch/EmbodiedQA + +The implementation consists of first independently training the - +- **[EQA-CNN-Pretrain](#eqa-cnn-pretrain-model)** for feature extraction +- **VQA model** (for predicting answer based on question and image input) +- **PACMAN (NAV) model** (for navigating to the required destination based on question and image input) + +followed by fine-tuning the NAV model. + +> "We employ a two-stage training process. First, the navigation and answering modules are independently trained using imitation/supervised learning on automatically generated expert demonstrations of navigation. Second, the navigation architecture is fine-tuned .." 
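+
+In this repository, the three stages correspond to three trainer configs, all launched through `habitat_baselines/run.py`; the full commands are given in the sections below:
+
+```
+ # 1) feature extractor  2) answering module  3) navigation module
+ python -u habitat_baselines/run.py --exp-config habitat_baselines/config/eqa/il_eqa_cnn_pretrain.yaml --run-type train
+ python -u habitat_baselines/run.py --exp-config habitat_baselines/config/eqa/il_vqa.yaml --run-type train
+ python -u habitat_baselines/run.py --exp-config habitat_baselines/config/eqa/il_pacman_nav.yaml --run-type train
+```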
+
+## Pre-requisites:
+
+- Habitat-sim and Habitat-api installation.
+- Download the Matterport 3D **scene dataset** and **task dataset** and place them in the appropriate folders (relevant information in the repository's [README](https://github.com/facebookresearch/habitat-api/blob/master/README.md)).
+
+---
+
+## EQA-CNN-Pretrain model
+
+### Information:
+This is an encoder-decoder network that takes RGB input and generates an RGB reconstruction, a depth map and a segmentation map. The encoder from this network is extracted and used as a feature extractor for the subsequent VQA and NAV trainers.
+
+(more information about the network in Appendix B of the [EQA paper](https://embodiedqa.org/paper.pdf)).
+
+### Configuration:
+
+Configuration for training the EQA-CNN-Pretrain model can be found in `habitat_baselines/config/eqa/il_eqa_cnn_pretrain.yaml`.
+
+### Train:
+
+```
+ python -u habitat_baselines/run.py --exp-config habitat_baselines/config/eqa/il_eqa_cnn_pretrain.yaml --run-type train
+```
+
+Training checkpoints are by default stored in `data/eqa/eqa_cnn_pretrain/checkpoints`.
+
+### Eval:
+
+```
+ python -u habitat_baselines/run.py --exp-config habitat_baselines/config/eqa/il_eqa_cnn_pretrain.yaml --run-type eval
+```
+
+Results from evaluation are stored in `data/eqa/eqa_cnn_pretrain/results/val`.
+
+
+### Pre-trained model
+
+The pre-trained EQA-CNN-Pretrain model can be downloaded from [here](https://drive.google.com/drive/folders/1yO8Pnyt-oxqAz0ozxwyI3OcaFRiKZKgd?usp=sharing).
+
+After downloading the pre-trained model, its path needs to be added to the config file's `EVAL_CKPT_PATH_DIR` parameter for evaluation.
+
+### Example results:
+
+
+
+---
+
+## Visual Question Answering (VQA) model
+
+### Information:
+The VQA model is responsible for predicting an answer based on the input question and a series of RGB images. The network first encodes images from the scene using the pre-trained EQA-CNN encoder mentioned above.
+
+(more information about the network can be found in the [paper](https://embodiedqa.org/paper.pdf)).
+
+### Configuration:
+
+Configuration for training the VQA (answering) model can be found in `habitat_baselines/config/eqa/il_vqa.yaml`.
+
+The VQA trainer picks the EQA-CNN pre-trained encoder checkpoint by default from `data/eqa/eqa_cnn_pretrain/checkpoints/epoch_5.ckpt`. If you want to use a different checkpoint for the EQA-CNN encoder, the corresponding path can be changed in the aforementioned config file's `EQA_CNN_PRETRAIN_CKPT_PATH` parameter.
+
+### Train:
+
+```
+ python -u habitat_baselines/run.py --exp-config habitat_baselines/config/eqa/il_vqa.yaml --run-type train
+```
+
+Training checkpoints are by default stored in `data/eqa/vqa/checkpoints`.
+
+### Pre-trained model
+
+The pre-trained VQA model can be downloaded from [here](https://www.dropbox.com/s/5e4srcc5odl4rbo/pretrained_vqa.ckpt?dl=0).
+
+After downloading the pre-trained model, add its path to the config file's `EVAL_CKPT_PATH_DIR` parameter for evaluation.
+
+### Eval:
+
+```
+ python -u habitat_baselines/run.py --exp-config habitat_baselines/config/eqa/il_vqa.yaml --run-type eval
+```
+
+Results from evaluation are stored in `data/eqa/vqa/results/val`.
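+
+For reference, checkpoints are plain `state_dict` files. Below is a minimal sketch of loading the pre-trained VQA model outside the trainer, mirroring what the VQA trainer's `_eval_checkpoint` does; the vocab dictionaries (`q_vocab_dict`, `ans_vocab_dict`) and the downloaded checkpoint path are assumed to exist on your side:
+
+```
+import torch
+
+from habitat_baselines.il.models.models import VqaLstmCnnAttentionModel
+
+model = VqaLstmCnnAttentionModel(
+    q_vocab=q_vocab_dict.word2idx_dict,      # question VocabDict (assumed)
+    ans_vocab=ans_vocab_dict.word2idx_dict,  # answer VocabDict (assumed)
+    eqa_cnn_pretrain_ckpt_path="data/eqa/eqa_cnn_pretrain/checkpoints/epoch_5.ckpt",
+)
+state_dict = torch.load("pretrained_vqa.ckpt", map_location={"cuda:0": "cpu"})
+model.load_state_dict(state_dict)
+model.eval()
+```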
+
+### Example results:
+
+![](https://user-images.githubusercontent.com/24846546/75141155-464bde00-56e8-11ea-9f2e-ca346440e1d2.jpg)
+![](https://user-images.githubusercontent.com/24846546/75141287-8e6b0080-56e8-11ea-8045-b4c4521954b2.jpg)
+
+----
+
+## NAV model (PACMAN)
+
+### Information:
+The NAV model (known as *PACMAN*) predicts the actions required to navigate the environment to the target destination based on question and RGB scene input.
+
+(more information about the network can be found in the [paper](https://embodiedqa.org/paper.pdf)).
+
+### Configuration:
+
+Configuration for training the NAV-PACMAN model can be found in `habitat_baselines/config/eqa/il_pacman_nav.yaml`.
+The trainer also picks the EQA-CNN pre-trained encoder checkpoint by default from `data/eqa/eqa_cnn_pretrain/checkpoints/epoch_5.ckpt`.
+
+### Train:
+
+```
+ python -u habitat_baselines/run.py --exp-config habitat_baselines/config/eqa/il_pacman_nav.yaml --run-type train
+```
+
+Training checkpoints are by default stored in `data/eqa/nav/checkpoints`.
+
+
+### Eval:
+
+```
+ python -u habitat_baselines/run.py --exp-config habitat_baselines/config/eqa/il_pacman_nav.yaml --run-type eval
+```
+
+Results from evaluation are stored in `data/eqa/nav/results/val`.
+
+### Example results:
+
+![](https://user-images.githubusercontent.com/24846546/78616220-2d942380-7863-11ea-9092-34a760352555.gif) ![](https://user-images.githubusercontent.com/24846546/78616221-2ec55080-7863-11ea-987b-2fdc2a802f24.gif) ![](https://user-images.githubusercontent.com/24846546/78616897-2cfc8c80-7865-11ea-8a4c-0afdfefea49c.gif)
diff --git a/habitat-lab-dialog/habitat_baselines/il/metrics.py b/habitat-lab-dialog/habitat_baselines/il/metrics.py
new file mode 100644
index 0000000..1dda5ce
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/il/metrics.py
@@ -0,0 +1,120 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
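+
+# Each entry in `Metric.metrics` keeps three running statistics per metric
+# name, selected by the `mode` argument of `get_stats` / `get_stat_string`:
+#   mode 0: running mean over all updates
+#   mode 1: exponential moving average (0.95 * previous + 0.05 * current)
+#   mode 2: value from the most recent update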
+ +import copy +import json +from typing import List, Tuple + +import numpy as np +import torch + + +class Metric: + def __init__(self, info=None, metric_names=None, log_json=None): + self.info = info + self.metric_names = sorted(metric_names) if metric_names else [] + + self.metrics = [[None, None, None] for _ in self.metric_names] + + self.stats = [] + self.num_iters = 0 + + self.log_json = log_json + + def update(self, values: List) -> None: + assert isinstance(values, list) + + self.num_iters += 1 + current_stats = [] + + for i in range(len(values)): + if values[i] is None: + continue + + if isinstance(values[i], list) is False: + values[i] = [values[i]] + + if self.metrics[i][0] is None: + self.metrics[i][0] = np.mean(values[i]) + self.metrics[i][1] = np.mean(values[i]) + self.metrics[i][2] = np.mean(values[i]) + else: + self.metrics[i][0] = ( + self.metrics[i][0] * (self.num_iters - 1) + + np.mean(values[i]) + ) / self.num_iters + + self.metrics[i][1] = 0.95 * self.metrics[i][ + 1 + ] + 0.05 * np.mean(values[i]) + + self.metrics[i][2] = np.mean(values[i]) + + self.metrics[i][0] = float(self.metrics[i][0]) + self.metrics[i][1] = float(self.metrics[i][1]) + self.metrics[i][2] = float(self.metrics[i][2]) + + current_stats.append(self.metrics[i]) + + self.stats.append(copy.deepcopy(current_stats)) + + def get_stat_string(self, mode: int = 1) -> str: + + stat_string = "" + + for k, v in self.info.items(): + stat_string += "[{}:{}]".format(k, v) + + stat_string += "[iters:{}]\n".format(self.num_iters) + for i in range(len(self.metric_names)): + if self.metrics[i][mode] is not None: + stat_string += "[{}:{:.3f}]".format( + self.metric_names[i], + self.metrics[i][mode], + ) + + return stat_string + + def get_stats(self, mode: int = 1) -> List[float]: + stats = [] + for i in range(len(self.metric_names)): + stats.append(self.metrics[i][mode]) + + return stats + + def dump_log(self) -> bool: + + if self.log_json is None: + return False + + dict_to_save = {"metric_names": self.metric_names, "stats": self.stats} + + with open(self.log_json, "w") as f: + json.dump(dict_to_save, f) + + return True + + +class VqaMetric(Metric): + def __init__(self, info=None, metric_names=None, log_json=None): + super().__init__(info, metric_names, log_json) + + def compute_ranks( + self, scores: torch.Tensor, labels: torch.Tensor + ) -> Tuple[np.ndarray, np.ndarray]: + accuracy = np.zeros(len(labels)) + ranks = np.full(len(labels), scores.shape[1]) + + for i in range(scores.shape[0]): + ranks[i] = scores[i].gt(scores[i][labels[i]]).sum() + 1 + if ranks[i] == 1: + accuracy[i] = 1 + return accuracy, ranks + + +class NavMetric(Metric): + def __init__(self, info=None, metric_names=None, log_json=None): + super().__init__(info, metric_names, log_json) diff --git a/habitat-lab-dialog/habitat_baselines/il/models/models.py b/habitat-lab-dialog/habitat_baselines/il/models/models.py new file mode 100644 index 0000000..76d5ba3 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/il/models/models.py @@ -0,0 +1,723 @@ +import math +from typing import Dict, Iterable, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + +from habitat import logger + + +def build_mlp( + input_dim: int, + hidden_dims: Iterable[int], + output_dim: int, + use_batchnorm: bool = False, + dropout: float = 0, + add_sigmoid: bool = True, +): + layers = [] + D = input_dim + if dropout > 0: + 
layers.append(nn.Dropout(p=dropout)) + if use_batchnorm: + layers.append(nn.BatchNorm1d(input_dim)) + for dim in hidden_dims: + layers.append(nn.Linear(D, dim)) + if use_batchnorm: + layers.append(nn.BatchNorm1d(dim)) + if dropout > 0: + layers.append(nn.Dropout(p=dropout)) + layers.append(nn.ReLU(inplace=True)) + D = dim + layers.append(nn.Linear(D, output_dim)) + + if add_sigmoid: + layers.append(nn.Sigmoid()) + return nn.Sequential(*layers) + + +class MultitaskCNN(nn.Module): + def __init__( + self, + num_classes: int = 41, + only_encoder: bool = False, + pretrained: bool = True, + checkpoint_path: str = "data/eqa/eqa_cnn_pretrain/checkpoints/epoch_5.ckpt", + freeze_encoder: bool = False, + ) -> None: + super(MultitaskCNN, self).__init__() + + self.num_classes = num_classes + self.only_encoder = only_encoder + + self.conv_block1 = nn.Sequential( + nn.Conv2d(3, 8, 5), + nn.BatchNorm2d(8), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + ) + self.conv_block2 = nn.Sequential( + nn.Conv2d(8, 16, 5), + nn.BatchNorm2d(16), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + ) + self.conv_block3 = nn.Sequential( + nn.Conv2d(16, 32, 5), + nn.BatchNorm2d(32), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + ) + self.conv_block4 = nn.Sequential( + nn.Conv2d(32, 32, 5), + nn.BatchNorm2d(32), + nn.ReLU(inplace=True), + nn.MaxPool2d(2, 2), + ) + self.classifier = nn.Sequential( + nn.Conv2d(32, 512, 5), + nn.BatchNorm2d(512), + nn.ReLU(inplace=True), + nn.Dropout2d(), + nn.Conv2d(512, 512, 1), + nn.BatchNorm2d(512), + nn.ReLU(inplace=True), + nn.Dropout2d(), + ) + + self.encoder_seg = nn.Conv2d(512, self.num_classes, 1) + self.encoder_depth = nn.Conv2d(512, 1, 1) + self.encoder_ae = nn.Conv2d(512, 3, 1) + + self.score_pool2_seg = nn.Conv2d(16, self.num_classes, 1) + self.score_pool3_seg = nn.Conv2d(32, self.num_classes, 1) + + self.score_pool2_depth = nn.Conv2d(16, 1, 1) + self.score_pool3_depth = nn.Conv2d(32, 1, 1) + + self.score_pool2_ae = nn.Conv2d(16, 3, 1) + self.score_pool3_ae = nn.Conv2d(32, 3, 1) + + if self.only_encoder: + if pretrained: + logger.info( + "Loading CNN weights from {}".format(checkpoint_path) + ) + checkpoint = torch.load( + checkpoint_path, map_location={"cuda:0": "cpu"} + ) + self.load_state_dict(checkpoint) + + if freeze_encoder: + for param in self.parameters(): + param.requires_grad = False + else: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = ( + m.kernel_size[0] + * m.kernel_size[1] + * (m.out_channels + m.in_channels) + ) + m.weight.data.normal_(0, math.sqrt(2.0 / n)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def forward(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor]: + conv1 = self.conv_block1(x) + conv2 = self.conv_block2(conv1) + conv3 = self.conv_block3(conv2) + conv4 = self.conv_block4(conv3) + + if self.only_encoder: + return conv4.view(-1, 32 * 12 * 12) + + encoder_output = self.classifier(conv4) + + encoder_output_seg = self.encoder_seg(encoder_output) + encoder_output_depth = self.encoder_depth(encoder_output) + encoder_output_ae = self.encoder_ae(encoder_output) + + score_pool2_seg = self.score_pool2_seg(conv2) + score_pool3_seg = self.score_pool3_seg(conv3) + + score_pool2_depth = self.score_pool2_depth(conv2) + score_pool3_depth = self.score_pool3_depth(conv3) + + score_pool2_ae = self.score_pool2_ae(conv2) + score_pool3_ae = self.score_pool3_ae(conv3) + + score_seg = F.interpolate( + encoder_output_seg, + score_pool3_seg.size()[2:], + mode="bilinear", + align_corners=True, + ) + score_seg += 
score_pool3_seg
+        score_seg = F.interpolate(
+            score_seg,
+            score_pool2_seg.size()[2:],
+            mode="bilinear",
+            align_corners=True,
+        )
+        score_seg += score_pool2_seg
+        out_seg = F.interpolate(
+            score_seg, x.size()[2:], mode="bilinear", align_corners=True
+        )
+
+        score_depth = F.interpolate(
+            encoder_output_depth,
+            score_pool3_depth.size()[2:],
+            mode="bilinear",
+            align_corners=True,
+        )
+        score_depth += score_pool3_depth
+        score_depth = F.interpolate(
+            score_depth,
+            score_pool2_depth.size()[2:],
+            mode="bilinear",
+            align_corners=True,
+        )
+        score_depth += score_pool2_depth
+        out_depth = torch.sigmoid(
+            F.interpolate(
+                score_depth, x.size()[2:], mode="bilinear", align_corners=True
+            )
+        )
+
+        score_ae = F.interpolate(
+            encoder_output_ae,
+            score_pool3_ae.size()[2:],
+            mode="bilinear",
+            align_corners=True,
+        )
+        score_ae += score_pool3_ae
+        score_ae = F.interpolate(
+            score_ae,
+            score_pool2_ae.size()[2:],
+            mode="bilinear",
+            align_corners=True,
+        )
+        score_ae += score_pool2_ae
+        out_ae = torch.sigmoid(
+            F.interpolate(
+                score_ae, x.size()[2:], mode="bilinear", align_corners=True
+            )
+        )
+
+        return out_seg, out_depth, out_ae
+
+
+class QuestionLstmEncoder(nn.Module):
+    def __init__(
+        self,
+        token_to_idx: Dict,
+        wordvec_dim: int = 64,
+        rnn_dim: int = 64,
+        rnn_num_layers: int = 2,
+        rnn_dropout: float = 0,
+    ) -> None:
+        super(QuestionLstmEncoder, self).__init__()
+
+        self.token_to_idx = token_to_idx
+        self.NULL = token_to_idx["<NULL>"]
+        self.START = token_to_idx["<START>"]
+        self.END = token_to_idx["<END>"]
+
+        self.embed = nn.Embedding(len(token_to_idx), wordvec_dim)
+        self.rnn = nn.LSTM(
+            wordvec_dim,
+            rnn_dim,
+            rnn_num_layers,
+            dropout=rnn_dropout,
+            batch_first=True,
+        )
+
+        self.init_weights()
+
+    def init_weights(self) -> None:
+        initrange = 0.1
+        self.embed.weight.data.uniform_(-initrange, initrange)
+
+    def forward(self, x: Tensor) -> Tensor:
+        N, T = x.size()
+        idx = torch.LongTensor(N).fill_(T - 1)
+
+        # Find the last non-null element in each sequence
+        idx = (x != self.NULL).long().sum(-1) - 1
+        idx = idx.type_as(x.data).long()
+        idx.requires_grad = False
+
+        hs, _ = self.rnn(self.embed(x.long()))
+
+        idx = idx.view(N, 1, 1).expand(N, 1, hs.size(2))
+        H = hs.size(2)
+        return hs.gather(1, idx).view(N, H)
+
+
+class VqaLstmCnnAttentionModel(nn.Module):
+    def __init__(
+        self,
+        q_vocab: Dict,
+        ans_vocab: Dict,
+        eqa_cnn_pretrain_ckpt_path: str,
+        freeze_encoder: bool = False,
+        image_feat_dim: int = 64,
+        question_wordvec_dim: int = 64,
+        question_hidden_dim: int = 64,
+        question_num_layers: int = 2,
+        question_dropout: float = 0.5,
+        fc_use_batchnorm: bool = False,
+        fc_dropout: float = 0.5,
+        fc_dims: Iterable[int] = (64,),
+    ) -> None:
+        super(VqaLstmCnnAttentionModel, self).__init__()
+
+        cnn_kwargs = {
+            "num_classes": 41,
+            "only_encoder": True,
+            "pretrained": True,
+            "checkpoint_path": eqa_cnn_pretrain_ckpt_path,
+            "freeze_encoder": freeze_encoder,
+        }
+        self.cnn = MultitaskCNN(**cnn_kwargs)
+        self.cnn_fc_layer = nn.Sequential(
+            nn.Linear(32 * 12 * 12, 64), nn.ReLU(), nn.Dropout(p=0.5)
+        )
+
+        q_rnn_kwargs = {
+            "token_to_idx": q_vocab,
+            "wordvec_dim": question_wordvec_dim,
+            "rnn_dim": question_hidden_dim,
+            "rnn_num_layers": question_num_layers,
+            "rnn_dropout": question_dropout,
+        }
+        self.q_rnn = QuestionLstmEncoder(**q_rnn_kwargs)
+
+        self.img_tr = nn.Sequential(nn.Linear(64, 64), nn.Dropout(p=0.5))
+
+        self.ques_tr = nn.Sequential(nn.Linear(64, 64), nn.Dropout(p=0.5))
+
+        classifier_kwargs = {
+            "input_dim": 64,
+            "hidden_dims": fc_dims,
+            "output_dim": 
len(ans_vocab), + "use_batchnorm": True, + "dropout": fc_dropout, + "add_sigmoid": False, + } + self.classifier = build_mlp(**classifier_kwargs) + + self.att = nn.Sequential( + nn.Tanh(), nn.Dropout(p=0.5), nn.Linear(128, 1) + ) + + def forward( + self, images: Tensor, questions: Tensor + ) -> Tuple[Tensor, Tensor]: + + N, T, _, _, _ = images.size() + # bs x 5 x 3 x 256 x 256 + img_feats = self.cnn( + images.contiguous().view( + -1, images.size(2), images.size(3), images.size(4) + ) + ) + + img_feats = self.cnn_fc_layer(img_feats) + + img_feats_tr = self.img_tr(img_feats) + ques_feats = self.q_rnn(questions) + + ques_feats_repl = ques_feats.view(N, 1, -1).repeat(1, T, 1) + ques_feats_repl = ques_feats_repl.view(N * T, -1) + + ques_feats_tr = self.ques_tr(ques_feats_repl) + + ques_img_feats = torch.cat([ques_feats_tr, img_feats_tr], 1) + + att_feats = self.att(ques_img_feats) + att_probs = F.softmax(att_feats.view(N, T), dim=1) + att_probs2 = att_probs.view(N, T, 1).repeat(1, 1, 64) + + att_img_feats = torch.mul(att_probs2, img_feats.view(N, T, 64)) + att_img_feats = torch.sum(att_img_feats, dim=1) + + mul_feats = torch.mul(ques_feats, att_img_feats) + + scores = self.classifier(mul_feats) + + return scores, att_probs + + +class MaskedNLLCriterion(nn.Module): + def __init__(self) -> None: + super(MaskedNLLCriterion, self).__init__() + + def forward(self, inp: Tensor, target: Tensor, mask: Tensor) -> Tensor: + logprob_select = torch.gather(inp, 1, target.long()) + out = torch.masked_select(logprob_select, mask) + loss = -torch.sum(out) / mask.float().sum() + return loss + + +class NavPlannerControllerModel(nn.Module): + def __init__( + self, + q_vocab: Dict, + num_output: int = 4, + question_wordvec_dim: int = 64, + question_hidden_dim: int = 64, + question_num_layers: int = 2, + question_dropout: float = 0.5, + planner_rnn_image_feat_dim: int = 128, + planner_rnn_action_embed_dim: int = 32, + planner_rnn_type: str = "GRU", + planner_rnn_hidden_dim: int = 1024, + planner_rnn_num_layers: int = 1, + planner_rnn_dropout: float = 0, + controller_fc_dims: Iterable[int] = (256,), + ) -> None: + super(NavPlannerControllerModel, self).__init__() + + self.cnn_fc_layer = nn.Sequential( + nn.Linear(32 * 12 * 12, planner_rnn_image_feat_dim), + nn.ReLU(), + nn.Dropout(p=0.5), + ) + + q_rnn_kwargs = { + "token_to_idx": q_vocab, + "wordvec_dim": question_wordvec_dim, + "rnn_dim": question_hidden_dim, + "rnn_num_layers": question_num_layers, + "rnn_dropout": question_dropout, + } + self.q_rnn = QuestionLstmEncoder(**q_rnn_kwargs) + self.ques_tr = nn.Sequential( + nn.Linear(question_hidden_dim, question_hidden_dim), + nn.ReLU(), + nn.Dropout(p=0.5), + ) + + self.planner_nav_rnn = NavRnn( + image_input=True, + image_feat_dim=planner_rnn_image_feat_dim, + question_input=True, + question_embed_dim=question_hidden_dim, + action_input=True, + action_embed_dim=planner_rnn_action_embed_dim, + num_actions=num_output, + rnn_type=planner_rnn_type, + rnn_hidden_dim=planner_rnn_hidden_dim, + rnn_num_layers=planner_rnn_num_layers, + rnn_dropout=planner_rnn_dropout, + return_states=True, + ) + + controller_kwargs = { + "input_dim": planner_rnn_image_feat_dim + + planner_rnn_action_embed_dim + + planner_rnn_hidden_dim, + "hidden_dims": controller_fc_dims, + "output_dim": 2, + "add_sigmoid": 0, + } + self.controller = build_mlp(**controller_kwargs) + + def forward( + self, + questions: Tensor, + planner_img_feats: Tensor, + planner_actions_in: Tensor, + planner_action_lengths: Tensor, + planner_hidden_index: Tensor, + 
controller_img_feats: Tensor, + controller_actions_in: Tensor, + controller_action_lengths: Tensor, + ) -> Tuple[Tensor, Tensor, Tensor]: + + N_p, T_p, _ = planner_img_feats.size() + + planner_img_feats = self.cnn_fc_layer(planner_img_feats) + controller_img_feats = self.cnn_fc_layer(controller_img_feats) + + ques_feats = self.q_rnn(questions) + ques_feats = self.ques_tr(ques_feats) + + planner_states, planner_scores, planner_hidden = self.planner_nav_rnn( + planner_img_feats, + ques_feats, + planner_actions_in, + planner_action_lengths, + ) + + planner_hidden_index = planner_hidden_index[ + :, : controller_action_lengths.max() + ] + controller_img_feats = controller_img_feats[ + :, : controller_action_lengths.max() + ] + controller_actions_in = controller_actions_in[ + :, : controller_action_lengths.max() + ] + + N_c, T_c, _ = controller_img_feats.size() + + assert planner_hidden_index.max().item() <= planner_states.size(1) + + planner_hidden_index = ( + planner_hidden_index.contiguous() + .view(N_p, planner_hidden_index.size(1), 1) + .repeat(1, 1, planner_states.size(2)) + ) + + controller_hidden_in = planner_states.gather( + 1, planner_hidden_index.long() + ) + + controller_hidden_in = controller_hidden_in.view( + N_c * T_c, controller_hidden_in.size(2) + ) + + controller_img_feats = controller_img_feats.contiguous().view( + N_c * T_c, -1 + ) + + controller_actions_embed = self.planner_nav_rnn.action_embed( + controller_actions_in.long() + ).view(N_c * T_c, -1) + + controller_in = torch.cat( + [ + controller_img_feats, + controller_actions_embed, + controller_hidden_in, + ], + 1, + ) + controller_scores = self.controller(controller_in) + return planner_scores, controller_scores, planner_hidden + + def planner_step( + self, + questions: Tensor, + img_feats: Tensor, + actions_in: Tensor, + planner_hidden: Tensor, + ) -> Tuple[Tensor]: + img_feats = self.cnn_fc_layer(img_feats) + ques_feats = self.q_rnn(questions) + ques_feats = self.ques_tr(ques_feats) + planner_scores, planner_hidden = self.planner_nav_rnn.step_forward( + img_feats, ques_feats, actions_in, planner_hidden + ) + + return planner_scores, planner_hidden + + def controller_step( + self, img_feats: Tensor, actions_in: Tensor, hidden_in: Tensor + ) -> Tensor: + + img_feats = self.cnn_fc_layer(img_feats) + actions_embed = self.planner_nav_rnn.action_embed(actions_in) + + img_feats = img_feats.view(1, -1) + actions_embed = actions_embed.view(1, -1) + hidden_in = hidden_in.view(1, -1) + + controller_in = torch.cat([img_feats, actions_embed, hidden_in], 1) + controller_scores = self.controller(controller_in) + + return controller_scores + + +class NavRnn(nn.Module): + def __init__( + self, + image_input: bool = False, + image_feat_dim: int = 128, + question_input: bool = False, + question_embed_dim: int = 128, + action_input: bool = False, + action_embed_dim: int = 32, + num_actions: int = 4, + mode: str = "sl", + rnn_type: str = "LSTM", + rnn_hidden_dim: int = 128, + rnn_num_layers: int = 2, + rnn_dropout: float = 0, + return_states: bool = False, + ) -> None: + super(NavRnn, self).__init__() + + self.image_input = image_input + self.image_feat_dim = image_feat_dim + + self.question_input = question_input + self.question_embed_dim = question_embed_dim + + self.action_input = action_input + self.action_embed_dim = action_embed_dim + + self.num_actions = num_actions + + self.rnn_type = rnn_type + self.rnn_hidden_dim = rnn_hidden_dim + self.rnn_num_layers = rnn_num_layers + + self.return_states = return_states + + rnn_input_dim = 0 + 
if self.image_input is True: + rnn_input_dim += image_feat_dim + logger.info( + "Adding input to {}: image, rnn dim: {}".format( + self.rnn_type, rnn_input_dim + ) + ) + + if self.question_input is True: + rnn_input_dim += question_embed_dim + logger.info( + "Adding input to {}: question, rnn dim: {}".format( + self.rnn_type, rnn_input_dim + ) + ) + + if self.action_input is True: + self.action_embed = nn.Embedding(num_actions, action_embed_dim) + rnn_input_dim += action_embed_dim + logger.info( + "Adding input to {}: action, rnn dim: {}".format( + self.rnn_type, rnn_input_dim + ) + ) + + self.rnn = getattr(nn, self.rnn_type)( + rnn_input_dim, + self.rnn_hidden_dim, + self.rnn_num_layers, + dropout=rnn_dropout, + batch_first=True, + ) + logger.info( + "Building {} with hidden dim: {}".format( + self.rnn_type, rnn_hidden_dim + ) + ) + + self.decoder = nn.Linear(self.rnn_hidden_dim, self.num_actions) + + def init_hidden(self, bsz: int) -> Union[Tensor, None]: + weight = next(self.parameters()).data + if self.rnn_type == "LSTM": + return ( + weight.new( + self.rnn_num_layers, bsz, self.rnn_hidden_dim + ).zero_(), + weight.new( + self.rnn_num_layers, bsz, self.rnn_hidden_dim + ).zero_(), + ) + elif self.rnn_type == "GRU": + return weight.new( + self.rnn_num_layers, bsz, self.rnn_hidden_dim + ).zero_() + else: + return None + + def forward( + self, + img_feats: Tensor, + question_feats: Tensor, + actions_in: Tensor, + action_lengths: Tensor, + hidden: bool = False, + ) -> Union[Tuple[Tensor, Tensor], Tuple[Tensor, Tensor, Tensor]]: + + T = False + if self.image_input is True: + N, T, _ = img_feats.size() + input_feats = img_feats + + if self.question_input is True: + N, D = question_feats.size() + question_feats = question_feats.view(N, 1, D) + if T is False: + T = actions_in.size(1) + question_feats = question_feats.repeat(1, T, 1) + if len(input_feats) == 0: + input_feats = question_feats + else: + input_feats = torch.cat([input_feats, question_feats], 2) + + if self.action_input is True: + if len(input_feats) == 0: + input_feats = self.action_embed(actions_in) + else: + input_feats = torch.cat( + [input_feats, self.action_embed(actions_in.long())], 2 + ) + + packed_input_feats = pack_padded_sequence( + input_feats, action_lengths, batch_first=True + ) + packed_output, hidden = self.rnn(packed_input_feats) + rnn_output, _ = pad_packed_sequence(packed_output, batch_first=True) + output = self.decoder( + rnn_output.contiguous().view( + rnn_output.size(0) * rnn_output.size(1), rnn_output.size(2) + ) + ) + + if self.return_states: + return rnn_output, output, hidden + else: + return output, hidden + + def step_forward( + self, + img_feats: Tensor, + question_feats: Tensor, + actions_in: Tensor, + hidden: Tensor, + ) -> Tuple[Tensor, Tensor]: + + T = False + if self.image_input is True: + N, T, _ = img_feats.size() + input_feats = img_feats + + if self.question_input is True: + N, D = question_feats.size() + question_feats = question_feats.view(N, 1, D) + if T is False: + T = actions_in.size(1) + question_feats = question_feats.repeat(1, T, 1) + if len(input_feats) == 0: + input_feats = question_feats + else: + input_feats = torch.cat([input_feats, question_feats], 2) + + if self.action_input is True: + if len(input_feats) == 0: + input_feats = self.action_embed(actions_in) + else: + actions_in = actions_in.long() + input_feats = torch.cat( + [input_feats, self.action_embed(actions_in)], 2 + ) + + output, hidden = self.rnn(input_feats, hidden) + output = self.decoder( + output.contiguous().view( + 
output.size(0) * output.size(1), output.size(2)
+            )
+        )
+
+        return output, hidden
diff --git a/habitat-lab-dialog/habitat_baselines/il/requirements.txt b/habitat-lab-dialog/habitat_baselines/il/requirements.txt
new file mode 100644
index 0000000..8d8abe7
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/il/requirements.txt
@@ -0,0 +1,2 @@
+lmdb>=0.98
+webdataset==0.1.40
diff --git a/habitat-lab-dialog/habitat_baselines/il/trainers/eqa_cnn_pretrain_trainer.py b/habitat-lab-dialog/habitat_baselines/il/trainers/eqa_cnn_pretrain_trainer.py
new file mode 100644
index 0000000..0eb8bbb
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/il/trainers/eqa_cnn_pretrain_trainer.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3

+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+import time
+
+import torch
+from torch.utils.data import DataLoader
+
+from habitat import logger
+from habitat_baselines.common.base_il_trainer import BaseILTrainer
+from habitat_baselines.common.baseline_registry import baseline_registry
+from habitat_baselines.common.tensorboard_utils import TensorboardWriter
+from habitat_baselines.il.data.eqa_cnn_pretrain_data import (
+    EQACNNPretrainDataset,
+)
+from habitat_baselines.il.models.models import MultitaskCNN
+from habitat_baselines.utils.visualizations.utils import (
+    save_depth_results,
+    save_rgb_results,
+    save_seg_results,
+)
+
+
+@baseline_registry.register_trainer(name="eqa-cnn-pretrain")
+class EQACNNPretrainTrainer(BaseILTrainer):
+    r"""Trainer class for the Encoder-Decoder feature extractor
+    used in EmbodiedQA (Das et al.; CVPR 2018)
+    Paper: https://embodiedqa.org/paper.pdf.
+    """
+    supported_tasks = ["EQA-v0"]
+
+    def __init__(self, config=None):
+        super().__init__(config)
+
+        self.device = (
+            torch.device("cuda", self.config.TORCH_GPU_ID)
+            if torch.cuda.is_available()
+            else torch.device("cpu")
+        )
+
+        if config is not None:
+            logger.info(f"config: {config}")
+
+    def _make_results_dir(self):
+        r"""Makes directory for saving eqa-cnn-pretrain eval results."""
+        for s_type in ["rgb", "seg", "depth"]:
+            dir_name = self.config.RESULTS_DIR.format(split="val", type=s_type)
+            if not os.path.isdir(dir_name):
+                os.makedirs(dir_name)
+
+    def _save_results(
+        self,
+        gt_rgb: torch.Tensor,
+        pred_rgb: torch.Tensor,
+        gt_seg: torch.Tensor,
+        pred_seg: torch.Tensor,
+        gt_depth: torch.Tensor,
+        pred_depth: torch.Tensor,
+        path: str,
+    ) -> None:
+        r"""For saving EQA-CNN-Pretrain reconstruction results.
+
+        Args:
+            gt_rgb: rgb ground truth
+            pred_rgb: autoencoder output rgb reconstruction
+            gt_seg: segmentation ground truth
+            pred_seg: segmentation output
+            gt_depth: depth map ground truth
+            pred_depth: depth map output
+            path: to write file
+        """
+
+        save_rgb_results(gt_rgb[0], pred_rgb[0], path)
+        save_seg_results(gt_seg[0], pred_seg[0], path)
+        save_depth_results(gt_depth[0], pred_depth[0], path)
+
+    def train(self) -> None:
+        r"""Main method for pre-training Encoder-Decoder Feature Extractor for EQA.
+ + Returns: + None + """ + config = self.config + + eqa_cnn_pretrain_dataset = EQACNNPretrainDataset(config) + + train_loader = DataLoader( + eqa_cnn_pretrain_dataset, + batch_size=config.IL.EQACNNPretrain.batch_size, + shuffle=True, + ) + + logger.info( + "[ train_loader has {} samples ]".format( + len(eqa_cnn_pretrain_dataset) + ) + ) + + model = MultitaskCNN() + model.train().to(self.device) + + optim = torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=float(config.IL.EQACNNPretrain.lr), + ) + + depth_loss = torch.nn.SmoothL1Loss() + ae_loss = torch.nn.SmoothL1Loss() + seg_loss = torch.nn.CrossEntropyLoss() + + epoch, t = 1, 0 + with TensorboardWriter( + config.TENSORBOARD_DIR, flush_secs=self.flush_secs + ) as writer: + while epoch <= config.IL.EQACNNPretrain.max_epochs: + start_time = time.time() + avg_loss = 0.0 + + for batch in train_loader: + t += 1 + + idx, gt_rgb, gt_depth, gt_seg = batch + + optim.zero_grad() + + gt_rgb = gt_rgb.to(self.device) + gt_depth = gt_depth.to(self.device) + gt_seg = gt_seg.to(self.device) + + pred_seg, pred_depth, pred_rgb = model(gt_rgb) + + l1 = seg_loss(pred_seg, gt_seg.long()) + l2 = ae_loss(pred_rgb, gt_rgb) + l3 = depth_loss(pred_depth, gt_depth) + + loss = l1 + (10 * l2) + (10 * l3) + + avg_loss += loss.item() + + if t % config.LOG_INTERVAL == 0: + logger.info( + "[ Epoch: {}; iter: {}; loss: {:.3f} ]".format( + epoch, t, loss.item() + ) + ) + + writer.add_scalar("total_loss", loss, t) + writer.add_scalars( + "individual_losses", + {"seg_loss": l1, "ae_loss": l2, "depth_loss": l3}, + t, + ) + + loss.backward() + optim.step() + + end_time = time.time() + time_taken = "{:.1f}".format((end_time - start_time) / 60) + avg_loss = avg_loss / len(train_loader) + + logger.info( + "[ Epoch {} completed. Time taken: {} minutes. ]".format( + epoch, time_taken + ) + ) + logger.info("[ Average loss: {:.3f} ]".format(avg_loss)) + + print("-----------------------------------------") + + self.save_checkpoint( + model.state_dict(), "epoch_{}.ckpt".format(epoch) + ) + + epoch += 1 + + def _eval_checkpoint( + self, + checkpoint_path: str, + writer: TensorboardWriter, + checkpoint_index: int = 0, + ) -> None: + r"""Evaluates a single checkpoint. 
+ + Args: + checkpoint_path: path of checkpoint + writer: tensorboard writer object for logging to tensorboard + checkpoint_index: index of cur checkpoint for logging + + Returns: + None + """ + config = self.config + + config.defrost() + config.TASK_CONFIG.DATASET.SPLIT = self.config.EVAL.SPLIT + config.freeze() + + eqa_cnn_pretrain_dataset = EQACNNPretrainDataset(config, mode="val") + + eval_loader = DataLoader( + eqa_cnn_pretrain_dataset, + batch_size=config.IL.EQACNNPretrain.batch_size, + shuffle=False, + ) + + logger.info( + "[ eval_loader has {} samples ]".format( + len(eqa_cnn_pretrain_dataset) + ) + ) + + model = MultitaskCNN() + + state_dict = torch.load(checkpoint_path) + model.load_state_dict(state_dict) + + model.to(self.device).eval() + + depth_loss = torch.nn.SmoothL1Loss() + ae_loss = torch.nn.SmoothL1Loss() + seg_loss = torch.nn.CrossEntropyLoss() + + t = 0 + avg_loss = 0.0 + avg_l1 = 0.0 + avg_l2 = 0.0 + avg_l3 = 0.0 + + with torch.no_grad(): + for batch in eval_loader: + t += 1 + + idx, gt_rgb, gt_depth, gt_seg = batch + gt_rgb = gt_rgb.to(self.device) + gt_depth = gt_depth.to(self.device) + gt_seg = gt_seg.to(self.device) + + pred_seg, pred_depth, pred_rgb = model(gt_rgb) + l1 = seg_loss(pred_seg, gt_seg.long()) + l2 = ae_loss(pred_rgb, gt_rgb) + l3 = depth_loss(pred_depth, gt_depth) + + loss = l1 + (10 * l2) + (10 * l3) + + avg_loss += loss.item() + avg_l1 += l1.item() + avg_l2 += l2.item() + avg_l3 += l3.item() + + if t % config.LOG_INTERVAL == 0: + logger.info( + "[ Iter: {}; loss: {:.3f} ]".format(t, loss.item()), + ) + + if ( + config.EVAL_SAVE_RESULTS + and t % config.EVAL_SAVE_RESULTS_INTERVAL == 0 + ): + + result_id = "ckpt_{}_{}".format( + checkpoint_index, idx[0].item() + ) + result_path = os.path.join( + self.config.RESULTS_DIR, result_id + ) + + self._save_results( + gt_rgb, + pred_rgb, + gt_seg, + pred_seg, + gt_depth, + pred_depth, + result_path, + ) + + avg_loss /= len(eval_loader) + avg_l1 /= len(eval_loader) + avg_l2 /= len(eval_loader) + avg_l3 /= len(eval_loader) + + writer.add_scalar("avg val total loss", avg_loss, checkpoint_index) + writer.add_scalars( + "avg val individual_losses", + {"seg_loss": avg_l1, "ae_loss": avg_l2, "depth_loss": avg_l3}, + checkpoint_index, + ) + + logger.info("[ Average loss: {:.3f} ]".format(avg_loss)) + logger.info("[ Average seg loss: {:.3f} ]".format(avg_l1)) + logger.info("[ Average autoencoder loss: {:.4f} ]".format(avg_l2)) + logger.info("[ Average depthloss: {:.4f} ]".format(avg_l3)) diff --git a/habitat-lab-dialog/habitat_baselines/il/trainers/pacman_trainer.py b/habitat-lab-dialog/habitat_baselines/il/trainers/pacman_trainer.py new file mode 100644 index 0000000..a0b40c2 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/il/trainers/pacman_trainer.py @@ -0,0 +1,660 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
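+
+# PACMAN splits navigation between two modules: the planner RNN picks the next
+# action from question and image features, and the controller decides at each
+# step whether to keep executing that action or return control to the planner
+# (see NavPlannerControllerModel in habitat_baselines/il/models/models.py).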
+ +import math +import os +import time +from datetime import datetime +from typing import Dict, List + +import numpy as np +import torch +import torch.nn.functional as F +from torch.utils.data import DataLoader + +import habitat +from habitat import logger +from habitat.core.utils import try_cv2_import +from habitat_baselines.common.base_il_trainer import BaseILTrainer +from habitat_baselines.common.baseline_registry import baseline_registry +from habitat_baselines.common.tensorboard_utils import TensorboardWriter +from habitat_baselines.il.data.nav_data import NavDataset +from habitat_baselines.il.metrics import NavMetric +from habitat_baselines.il.models.models import ( + MaskedNLLCriterion, + NavPlannerControllerModel, +) +from habitat_baselines.utils.common import generate_video + +cv2 = try_cv2_import() + + +@baseline_registry.register_trainer(name="pacman") +class PACMANTrainer(BaseILTrainer): + r"""Trainer class for PACMAN (Planner and Controller Module) Nav model + used in EmbodiedQA (Das et. al.;CVPR 2018) + Paper: https://embodiedqa.org/paper.pdf. + """ + supported_tasks = ["EQA-v0"] + + def __init__(self, config=None): + super().__init__(config) + + self.device = ( + torch.device("cuda", self.config.TORCH_GPU_ID) + if torch.cuda.is_available() + else torch.device("cpu") + ) + + if config is not None: + logger.info(f"config: {config}") + + def _save_nav_results( + self, + ckpt_path: int, + ep_id: int, + questions: torch.Tensor, + imgs: List[np.ndarray], + q_vocab_dict: Dict, + results_dir: str, + writer: TensorboardWriter, + video_option: list, + ) -> None: + + r"""For saving NAV-PACMAN eval results. + Args: + ckpt_path: path of checkpoint being evaluated + ep_id: episode id (batch index) + questions: input question to model + imgs: images' tensor containing input frames + q_vocab_dict: question vocab dictionary + results_dir: dir to save results + writer: tensorboard writer + video_option: ["disk", "tb"] + Returns: + None + """ + + question = questions[0] + + ckpt_epoch = ckpt_path[ckpt_path.rfind("/") + 1 : -5] + results_dir = os.path.join(results_dir, ckpt_epoch) + ckpt_no = ckpt_epoch[6:] + + q_string = q_vocab_dict.token_idx_2_string(question) + frames_with_text = [] + for frame in imgs: + border_width = 32 + font = cv2.FONT_HERSHEY_SIMPLEX + color = (0, 0, 0) + scale = 0.3 + thickness = 1 + + frame = cv2.copyMakeBorder( + frame, + border_width, + border_width, + border_width, + border_width, + cv2.BORDER_CONSTANT, + value=(255, 255, 255), + ) + + frame = cv2.putText( + frame, + "Question: " + q_string, + (10, 15), + font, + scale, + color, + thickness, + ) + + frames_with_text.append(frame) + generate_video( + video_option, + results_dir, + frames_with_text, + ep_id, + ckpt_no, + {}, + writer, + fps=5, + ) + + def train(self) -> None: + r"""Main method for training Navigation model of EQA. 
+ + Returns: + None + """ + config = self.config + + with habitat.Env(config.TASK_CONFIG) as env: + nav_dataset = ( + NavDataset( + config, + env, + self.device, + ) + .shuffle(1000) + .decode("rgb") + ) + + nav_dataset = nav_dataset.map(nav_dataset.map_dataset_sample) + + train_loader = DataLoader( + nav_dataset, batch_size=config.IL.NAV.batch_size + ) + + logger.info("train_loader has {} samples".format(len(nav_dataset))) + + q_vocab_dict, _ = nav_dataset.get_vocab_dicts() + + model_kwargs = {"q_vocab": q_vocab_dict.word2idx_dict} + model = NavPlannerControllerModel(**model_kwargs) + + planner_loss_fn = MaskedNLLCriterion() + controller_loss_fn = MaskedNLLCriterion() + + optim = torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=float(config.IL.NAV.lr), + ) + + metrics = NavMetric( + info={"split": "train"}, + metric_names=["planner_loss", "controller_loss"], + log_json=os.path.join(config.OUTPUT_LOG_DIR, "train.json"), + ) + + epoch = 1 + + avg_p_loss = 0.0 + avg_c_loss = 0.0 + + logger.info(model) + model.train().to(self.device) + + with TensorboardWriter( + "train_{}/{}".format( + config.TENSORBOARD_DIR, + datetime.today().strftime("%Y-%m-%d-%H:%M"), + ), + flush_secs=self.flush_secs, + ) as writer: + while epoch <= config.IL.NAV.max_epochs: + start_time = time.time() + for t, batch in enumerate(train_loader): + batch = ( + item.to(self.device, non_blocking=True) + for item in batch + ) + ( + idx, + questions, + _, + planner_img_feats, + planner_actions_in, + planner_actions_out, + planner_action_lengths, + planner_masks, + controller_img_feats, + controller_actions_in, + planner_hidden_idx, + controller_outs, + controller_action_lengths, + controller_masks, + ) = batch + + ( + planner_action_lengths, + perm_idx, + ) = planner_action_lengths.sort(0, descending=True) + questions = questions[perm_idx] + + planner_img_feats = planner_img_feats[perm_idx] + planner_actions_in = planner_actions_in[perm_idx] + planner_actions_out = planner_actions_out[perm_idx] + planner_masks = planner_masks[perm_idx] + + controller_img_feats = controller_img_feats[perm_idx] + controller_actions_in = controller_actions_in[perm_idx] + controller_outs = controller_outs[perm_idx] + planner_hidden_idx = planner_hidden_idx[perm_idx] + controller_action_lengths = controller_action_lengths[ + perm_idx + ] + controller_masks = controller_masks[perm_idx] + + ( + planner_scores, + controller_scores, + planner_hidden, + ) = model( + questions, + planner_img_feats, + planner_actions_in, + planner_action_lengths.cpu().numpy(), + planner_hidden_idx, + controller_img_feats, + controller_actions_in, + controller_action_lengths, + ) + + planner_logprob = F.log_softmax(planner_scores, dim=1) + controller_logprob = F.log_softmax( + controller_scores, dim=1 + ) + + planner_loss = planner_loss_fn( + planner_logprob, + planner_actions_out[ + :, : planner_action_lengths.max() + ].reshape(-1, 1), + planner_masks[ + :, : planner_action_lengths.max() + ].reshape(-1, 1), + ) + + controller_loss = controller_loss_fn( + controller_logprob, + controller_outs[ + :, : controller_action_lengths.max() + ].reshape(-1, 1), + controller_masks[ + :, : controller_action_lengths.max() + ].reshape(-1, 1), + ) + + # zero grad + optim.zero_grad() + + # update metrics + metrics.update( + [planner_loss.item(), controller_loss.item()] + ) + + (planner_loss + controller_loss).backward() + + optim.step() + + (planner_loss, controller_loss) = metrics.get_stats() + + avg_p_loss += planner_loss + avg_c_loss += controller_loss + 
+ if t % config.LOG_INTERVAL == 0: + logger.info("Epoch: {}".format(epoch)) + logger.info(metrics.get_stat_string()) + + writer.add_scalar("planner loss", planner_loss, t) + writer.add_scalar( + "controller loss", controller_loss, t + ) + + metrics.dump_log() + + # Dataloader length for IterableDataset doesn't take into + # account batch size for Pytorch v < 1.6.0 + num_batches = math.ceil( + len(nav_dataset) / config.IL.NAV.batch_size + ) + + avg_p_loss /= num_batches + avg_c_loss /= num_batches + + end_time = time.time() + time_taken = "{:.1f}".format((end_time - start_time) / 60) + + logger.info( + "Epoch {} completed. Time taken: {} minutes.".format( + epoch, time_taken + ) + ) + + logger.info( + "Average planner loss: {:.2f}".format(avg_p_loss) + ) + logger.info( + "Average controller loss: {:.2f}".format(avg_c_loss) + ) + + print("-----------------------------------------") + + if epoch % config.CHECKPOINT_INTERVAL == 0: + self.save_checkpoint( + model.state_dict(), "epoch_{}.ckpt".format(epoch) + ) + + epoch += 1 + + def _eval_checkpoint( + self, + checkpoint_path: str, + writer: TensorboardWriter, + checkpoint_index: int = 0, + ) -> None: + r"""Evaluates a single checkpoint. + + Args: + checkpoint_path: path of checkpoint + writer: tensorboard writer object for logging to tensorboard + checkpoint_index: index of cur checkpoint for logging + + Returns: + None + """ + config = self.config + + config.defrost() + config.TASK_CONFIG.DATASET.SPLIT = self.config.EVAL.SPLIT + config.freeze() + + with habitat.Env(config.TASK_CONFIG) as env: + nav_dataset = NavDataset( + config, + env, + self.device, + ).decode("rgb") + + nav_dataset = nav_dataset.map(nav_dataset.map_dataset_sample) + + eval_loader = DataLoader(nav_dataset) + + logger.info("eval_loader has {} samples".format(len(nav_dataset))) + + q_vocab_dict, ans_vocab_dict = nav_dataset.get_vocab_dicts() + + model_kwargs = {"q_vocab": q_vocab_dict.word2idx_dict} + model = NavPlannerControllerModel(**model_kwargs) + + invalids = [] + + state_dict = torch.load(checkpoint_path) + model.load_state_dict(state_dict) + model.eval().to(self.device) + + results_dir = config.RESULTS_DIR.format(split="val") + video_option = self.config.VIDEO_OPTION + + metrics = NavMetric( + info={"split": "val"}, + metric_names=[ + "{}_{}".format(y, x) + for x in [10, 30, 50, "rand_init"] + for z in ["", "_f"] + for y in [ + *["d_{}{}".format(k, z) for k in [0, "T", "D", "min"]], + *[w for w in ["stop", "ep_len"] if z == ""], + ] + ], + log_json=os.path.join(config.OUTPUT_LOG_DIR, "eval.json"), + ) + + for t, batch in enumerate(eval_loader): + idx, question, answer, actions, action_length, goal_pos = batch + + metrics_slug = {} + imgs = [] + for i in [10, 30, 50, "rand_init"]: + for j in ["pred", "fwd-only"]: + question = question.to(self.device) + + controller_step = False + planner_hidden = model.planner_nav_rnn.init_hidden(1) + + # get hierarchical action history + ( + planner_actions_in, + planner_img_feats, + controller_step, + controller_action_in, + controller_img_feats, + init_pos, + controller_action_counter, + ) = nav_dataset.get_hierarchical_features_till_spawn( + idx.item(), + actions[0, : action_length.item()].numpy(), + i if i != "rand_init" else action_length.item(), + config.IL.NAV.max_controller_actions, + ) + if j == "pred": + planner_actions_in = planner_actions_in.to( + self.device + ) + planner_img_feats = planner_img_feats.to( + self.device + ) + + for step in range(planner_actions_in.size(0)): + + ( + planner_scores, + planner_hidden, + ) = 
model.planner_step( + question, + planner_img_feats[step][ + (None,) * 2 + ], # unsqueezing twice + planner_actions_in[step].view(1, 1), + planner_hidden, + ) + + env.sim.set_agent_state( + init_pos.position, init_pos.rotation + ) + init_dist_to_target = env.sim.geodesic_distance( + init_pos.position, goal_pos + ) + + if ( + init_dist_to_target < 0 + or init_dist_to_target == float("inf") + ): # unreachable + invalids.append([idx.item(), i]) + continue + + dists_to_target, pos_queue = [init_dist_to_target], [ + init_pos + ] + if j == "pred": + planner_actions, controller_actions = [], [] + + if config.IL.NAV.max_controller_actions > 1: + controller_action_counter = ( + controller_action_counter + % config.IL.NAV.max_controller_actions + ) + controller_action_counter = max( + controller_action_counter - 1, 0 + ) + else: + controller_action_counter = 0 + + first_step = True + first_step_is_controller = controller_step + planner_step = True + action = int(controller_action_in) + + img = None + for episode_length in range( + config.IL.NAV.max_episode_length + ): + if j == "pred": + if not first_step: + if ( + i == 30 + ): # saving results for 30-step walked back case + imgs.append(img) + img_feat = ( + eval_loader.dataset.get_img_features( + img, preprocess=True + ).view(1, 1, 4608) + ) + else: + img_feat = controller_img_feats.to( + self.device + ).view(1, 1, 4608) + + if not first_step or first_step_is_controller: + # query controller to continue or not + controller_action_in = ( + torch.LongTensor(1, 1) + .fill_(action) + .to(self.device) + ) + controller_scores = model.controller_step( + img_feat, + controller_action_in, + planner_hidden[0], + ) + + prob = F.softmax(controller_scores, dim=1) + controller_action = int( + prob.max(1)[1].data.cpu().numpy()[0] + ) + + if ( + controller_action == 1 + and controller_action_counter + < config.IL.NAV.max_controller_actions + - 1 + ): + controller_action_counter += 1 + planner_step = False + else: + controller_action_counter = 0 + planner_step = True + controller_action = 0 + + controller_actions.append( + controller_action + ) + first_step = False + + if planner_step: + if not first_step: + action_in = ( + torch.LongTensor(1, 1) + .fill_(action + 1) + .to(self.device) + ) + ( + planner_scores, + planner_hidden, + ) = model.planner_step( + question, + img_feat, + action_in, + planner_hidden, + ) + prob = F.softmax(planner_scores, dim=1) + action = int( + prob.max(1)[1].data.cpu().numpy()[0] + ) + planner_actions.append(action) + + else: + action = 0 + + episode_done = ( + action == 3 + or episode_length + >= config.IL.NAV.max_episode_length + ) + + agent_pos = env.sim.get_agent_state().position + + dists_to_target.append( + env.sim.geodesic_distance(agent_pos, goal_pos) + ) + pos_queue.append([agent_pos]) + + if episode_done: + break + + if action == 0: + my_action = 1 # forward + elif action == 1: + my_action = 2 # left + elif action == 2: + my_action = 3 # right + elif action == 3: + my_action = 0 # stop + + observations = env.sim.step(my_action) + img = observations["rgb"] + first_step = False + + # compute stats + m = "" if j == "pred" else "_f" + metrics_slug[ + "d_T{}_{}".format(m, i) + ] = dists_to_target[-1] + metrics_slug["d_D{}_{}".format(m, i)] = ( + dists_to_target[0] - dists_to_target[-1] + ) + metrics_slug["d_min{}_{}".format(m, i)] = np.array( + dists_to_target + ).min() + + if j != "fwd-only": + metrics_slug[ + "ep_len_{}".format(i) + ] = episode_length + if action == 3: + metrics_slug["stop_{}".format(i)] = 1 + else: + 
metrics_slug["stop_{}".format(i)] = 0 + + metrics_slug["d_0_{}".format(i)] = dists_to_target[ + 0 + ] + + # collate and update metrics + metrics_list = [] + for ind, i in enumerate(metrics.metric_names): + if i not in metrics_slug: + metrics_list.append(metrics.metrics[ind][0]) + else: + metrics_list.append(metrics_slug[i]) + + # update metrics + metrics.update(metrics_list) + + if t % config.LOG_INTERVAL == 0: + logger.info( + "Valid cases: {}; Invalid cases: {}".format( + (t + 1) * 8 - len(invalids), len(invalids) + ) + ) + logger.info( + "EVAL: Avg metrics: {}".format( + metrics.get_stat_string(mode=0) + ) + ) + print( + "-----------------------------------------------------" + ) + + if ( + config.EVAL_SAVE_RESULTS + and t % config.EVAL_SAVE_RESULTS_INTERVAL == 0 + ): + q_string = q_vocab_dict.token_idx_2_string(question[0]) + logger.info("Question: {}".format(q_string)) + + self._save_nav_results( + checkpoint_path, + t, + question, + imgs, + q_vocab_dict, + results_dir, + writer, + video_option, + ) diff --git a/habitat-lab-dialog/habitat_baselines/il/trainers/vqa_trainer.py b/habitat-lab-dialog/habitat_baselines/il/trainers/vqa_trainer.py new file mode 100644 index 0000000..2b09d55 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/il/trainers/vqa_trainer.py @@ -0,0 +1,420 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +import os +import time + +import torch +from torch.utils.data import DataLoader + +from habitat import logger +from habitat.datasets.utils import VocabDict +from habitat_baselines.common.base_il_trainer import BaseILTrainer +from habitat_baselines.common.baseline_registry import baseline_registry +from habitat_baselines.common.tensorboard_utils import TensorboardWriter +from habitat_baselines.il.data.data import EQADataset +from habitat_baselines.il.metrics import VqaMetric +from habitat_baselines.il.models.models import VqaLstmCnnAttentionModel +from habitat_baselines.utils.common import img_bytes_2_np_array +from habitat_baselines.utils.visualizations.utils import save_vqa_image_results + + +@baseline_registry.register_trainer(name="vqa") +class VQATrainer(BaseILTrainer): + r"""Trainer class for VQA model used in EmbodiedQA (Das et. al.; CVPR 2018) + Paper: https://embodiedqa.org/paper.pdf. + """ + supported_tasks = ["VQA-v0"] + + def __init__(self, config=None): + super().__init__(config) + + self.device = ( + torch.device("cuda", self.config.TORCH_GPU_ID) + if torch.cuda.is_available() + else torch.device("cpu") + ) + + if config is not None: + logger.info(f"config: {config}") + + def _make_results_dir(self): + r"""Makes directory for saving VQA eval results.""" + dir_name = self.config.RESULTS_DIR.format(split="val") + os.makedirs(dir_name, exist_ok=True) + + def _save_vqa_results( + self, + ckpt_idx: int, + episode_ids: torch.Tensor, + questions: torch.Tensor, + images: torch.Tensor, + pred_scores: torch.Tensor, + gt_answers: torch.Tensor, + q_vocab_dict: VocabDict, + ans_vocab_dict: VocabDict, + ) -> None: + + r"""For saving VQA results. 
+ Args: + ckpt_idx: index of the checkpoint being evaluated + episode_ids: episode ids of batch + questions: input questions to model + images: tensor of input frames + pred_scores: model prediction scores + gt_answers: ground truth answers + q_vocab_dict: Question VocabDict + ans_vocab_dict: Answer VocabDict + + Returns: + None + """
+ episode_id = episode_ids[0].item() + question = questions[0] + images = images[0] + gt_answer = gt_answers[0] + scores = pred_scores[0] + + q_string = q_vocab_dict.token_idx_2_string(question) + + _, index = scores.max(0)
+ pred_answer = sorted(ans_vocab_dict.word2idx_dict.keys())[index] + gt_answer = sorted(ans_vocab_dict.word2idx_dict.keys())[gt_answer] + + logger.info("Question: {}".format(q_string)) + logger.info("Predicted answer: {}".format(pred_answer)) + logger.info("Ground-truth answer: {}".format(gt_answer)) + + result_path = self.config.RESULTS_DIR.format( + split=self.config.TASK_CONFIG.DATASET.SPLIT + ) + + result_path = os.path.join( + result_path, "ckpt_{}_{}_image.jpg".format(ckpt_idx, episode_id) + ) + + save_vqa_image_results( + images, q_string, pred_answer, gt_answer, result_path + ) + + def train(self) -> None: + r"""Main method for training VQA (Answering) model of EQA. + + Returns: + None + """ + config = self.config + + # env = habitat.Env(config=config.TASK_CONFIG) + + vqa_dataset = ( + EQADataset( + config, + input_type="vqa", + num_frames=config.IL.VQA.num_frames, + ) + .shuffle(1000) + .to_tuple( + "episode_id", + "question", + "answer", + *["{0:0=3d}.jpg".format(x) for x in range(0, 5)], + ) + .map(img_bytes_2_np_array) + ) + + train_loader = DataLoader( + vqa_dataset, batch_size=config.IL.VQA.batch_size + ) + + logger.info("train_loader has {} samples".format(len(vqa_dataset))) + + q_vocab_dict, ans_vocab_dict = vqa_dataset.get_vocab_dicts() + + model_kwargs = { + "q_vocab": q_vocab_dict.word2idx_dict, + "ans_vocab": ans_vocab_dict.word2idx_dict, + "eqa_cnn_pretrain_ckpt_path": config.EQA_CNN_PRETRAIN_CKPT_PATH, + "freeze_encoder": config.IL.VQA.freeze_encoder, + } + + model = VqaLstmCnnAttentionModel(**model_kwargs) + + lossFn = torch.nn.CrossEntropyLoss() + + optim = torch.optim.Adam( + filter(lambda p: p.requires_grad, model.parameters()), + lr=float(config.IL.VQA.lr), + ) + + metrics = VqaMetric( + info={"split": "train"}, + metric_names=[ + "loss", + "accuracy", + "mean_rank", + "mean_reciprocal_rank", + ], + log_json=os.path.join(config.OUTPUT_LOG_DIR, "train.json"), + ) + + t, epoch = 0, 1 + + avg_loss = 0.0 + avg_accuracy = 0.0 + avg_mean_rank = 0.0 + avg_mean_reciprocal_rank = 0.0 + + logger.info(model) + model.train().to(self.device) + + if config.IL.VQA.freeze_encoder: + model.cnn.eval() + + with TensorboardWriter( + config.TENSORBOARD_DIR, flush_secs=self.flush_secs + ) as writer: + while epoch <= config.IL.VQA.max_epochs: + start_time = time.time() + for batch in train_loader: + t += 1 + _, questions, answers, frame_queue = batch + optim.zero_grad() + + questions = questions.to(self.device) + answers = answers.to(self.device) + frame_queue = frame_queue.to(self.device) + + scores, _ = model(frame_queue, questions) + loss = lossFn(scores, answers) + + # update metrics + accuracy, ranks = metrics.compute_ranks( + scores.data.cpu(), answers + ) + metrics.update([loss.item(), accuracy, ranks, 1.0 / ranks]) + + loss.backward() + optim.step() + + ( + metrics_loss, + accuracy, + mean_rank, + mean_reciprocal_rank, + ) = metrics.get_stats() + + avg_loss += metrics_loss +
avg_accuracy += accuracy + avg_mean_rank += mean_rank + avg_mean_reciprocal_rank += mean_reciprocal_rank + + if t % config.LOG_INTERVAL == 0: + logger.info("Epoch: {}".format(epoch)) + logger.info(metrics.get_stat_string()) + + writer.add_scalar("loss", metrics_loss, t) + writer.add_scalar("accuracy", accuracy, t) + writer.add_scalar("mean_rank", mean_rank, t) + writer.add_scalar( + "mean_reciprocal_rank", mean_reciprocal_rank, t + ) + + metrics.dump_log() + + # Dataloader length for IterableDataset doesn't take into + # account batch size for Pytorch v < 1.6.0 + num_batches = math.ceil( + len(vqa_dataset) / config.IL.VQA.batch_size + ) + + avg_loss /= num_batches + avg_accuracy /= num_batches + avg_mean_rank /= num_batches + avg_mean_reciprocal_rank /= num_batches + + end_time = time.time() + time_taken = "{:.1f}".format((end_time - start_time) / 60) + + logger.info( + "Epoch {} completed. Time taken: {} minutes.".format( + epoch, time_taken + ) + ) + + logger.info("Average loss: {:.2f}".format(avg_loss)) + logger.info("Average accuracy: {:.2f}".format(avg_accuracy)) + logger.info("Average mean rank: {:.2f}".format(avg_mean_rank)) + logger.info( + "Average mean reciprocal rank: {:.2f}".format( + avg_mean_reciprocal_rank + ) + ) + + print("-----------------------------------------") + + self.save_checkpoint( + model.state_dict(), "epoch_{}.ckpt".format(epoch) + ) + + epoch += 1 + + def _eval_checkpoint( + self, + checkpoint_path: str, + writer: TensorboardWriter, + checkpoint_index: int = 0, + ) -> None: + r"""Evaluates a single checkpoint. + + Args: + checkpoint_path: path of checkpoint + writer: tensorboard writer object for logging to tensorboard + checkpoint_index: index of cur checkpoint for logging + + Returns: + None + """ + config = self.config + + config.defrost() + config.TASK_CONFIG.DATASET.SPLIT = self.config.EVAL.SPLIT + config.freeze() + + vqa_dataset = ( + EQADataset( + config, + input_type="vqa", + num_frames=config.IL.VQA.num_frames, + ) + .shuffle(1000) + .to_tuple( + "episode_id", + "question", + "answer", + *["{0:0=3d}.jpg".format(x) for x in range(0, 5)], + ) + .map(img_bytes_2_np_array) + ) + + eval_loader = DataLoader( + vqa_dataset, batch_size=config.IL.VQA.batch_size + ) + + logger.info("eval_loader has {} samples".format(len(vqa_dataset))) + + q_vocab_dict, ans_vocab_dict = vqa_dataset.get_vocab_dicts() + + model_kwargs = { + "q_vocab": q_vocab_dict.word2idx_dict, + "ans_vocab": ans_vocab_dict.word2idx_dict, + "eqa_cnn_pretrain_ckpt_path": config.EQA_CNN_PRETRAIN_CKPT_PATH, + } + model = VqaLstmCnnAttentionModel(**model_kwargs) + + state_dict = torch.load( + checkpoint_path, map_location={"cuda:0": "cpu"} + ) + model.load_state_dict(state_dict) + + lossFn = torch.nn.CrossEntropyLoss() + + t = 0 + + avg_loss = 0.0 + avg_accuracy = 0.0 + avg_mean_rank = 0.0 + avg_mean_reciprocal_rank = 0.0 + + model.eval() + model.cnn.eval() + model.to(self.device) + + metrics = VqaMetric( + info={"split": "val"}, + metric_names=[ + "loss", + "accuracy", + "mean_rank", + "mean_reciprocal_rank", + ], + log_json=os.path.join(config.OUTPUT_LOG_DIR, "eval.json"), + ) + with torch.no_grad(): + for batch in eval_loader: + t += 1 + episode_ids, questions, answers, frame_queue = batch + questions = questions.to(self.device) + answers = answers.to(self.device) + frame_queue = frame_queue.to(self.device) + + scores, _ = model(frame_queue, questions) + + loss = lossFn(scores, answers) + + accuracy, ranks = metrics.compute_ranks( + scores.data.cpu(), answers + ) + metrics.update([loss.item(), 
accuracy, ranks, 1.0 / ranks]) + + ( + metrics_loss, + accuracy, + mean_rank, + mean_reciprocal_rank, + ) = metrics.get_stats(mode=0) + + avg_loss += metrics_loss + avg_accuracy += accuracy + avg_mean_rank += mean_rank + avg_mean_reciprocal_rank += mean_reciprocal_rank + + if t % config.LOG_INTERVAL == 0: + logger.info(metrics.get_stat_string(mode=0)) + metrics.dump_log() + + if ( + config.EVAL_SAVE_RESULTS + and t % config.EVAL_SAVE_RESULTS_INTERVAL == 0 + ): + + self._save_vqa_results( + checkpoint_index, + episode_ids, + questions, + frame_queue, + scores, + answers, + q_vocab_dict, + ans_vocab_dict, + ) + + num_batches = math.ceil(len(vqa_dataset) / config.IL.VQA.batch_size) + + avg_loss /= num_batches + avg_accuracy /= num_batches + avg_mean_rank /= num_batches + avg_mean_reciprocal_rank /= num_batches + + writer.add_scalar("avg val loss", avg_loss, checkpoint_index) + writer.add_scalar("avg val accuracy", avg_accuracy, checkpoint_index) + writer.add_scalar("avg val mean rank", avg_mean_rank, checkpoint_index) + writer.add_scalar( + "avg val mean reciprocal rank", + avg_mean_reciprocal_rank, + checkpoint_index, + ) + + logger.info("Average loss: {:.2f}".format(avg_loss)) + logger.info("Average accuracy: {:.2f}".format(avg_accuracy)) + logger.info("Average mean rank: {:.2f}".format(avg_mean_rank)) + logger.info( + "Average mean reciprocal rank: {:.2f}".format( + avg_mean_reciprocal_rank + ) + ) diff --git a/habitat-lab-dialog/habitat_baselines/py.typed b/habitat-lab-dialog/habitat_baselines/py.typed new file mode 100644 index 0000000..abe48a5 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/py.typed @@ -0,0 +1 @@ +# Marker file for PEP 561. This tells mypy that the package uses inline types. diff --git a/habitat-lab-dialog/habitat_baselines/rl/__init__.py b/habitat-lab-dialog/habitat_baselines/rl/__init__.py new file mode 100644 index 0000000..240697e --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/README.md b/habitat-lab-dialog/habitat_baselines/rl/ddppo/README.md new file mode 100644 index 0000000..380a4ae --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/README.md @@ -0,0 +1,71 @@ +# Decentralized Distributed PPO
+
+Provides changes to the core baseline PPO algorithm and training script to implement Decentralized Distributed PPO (DD-PPO).
+DD-PPO leverages distributed data parallelism to seamlessly scale PPO to hundreds of GPUs with no centralized server.
+
+See the [paper](https://arxiv.org/abs/1911.00357) for more detail.
+
+## Running
+
+Two example scripts are provided: a single-node script, `single_node.sh`, that leverages `torch.distributed.launch` to create multiple workers, and a multi-node script, `multi_node_slurm.sh`, that leverages [SLURM](https://slurm.schedmd.com/documentation.html) to create the workers on multiple nodes.
+
+The two recommended backends are GLOO and NCCL. Use NCCL if your system has it, and GLOO otherwise.
+
+See [pytorch's distributed docs](https://pytorch.org/docs/stable/distributed.html#backends-that-come-with-pytorch)
+and [pytorch's distributed tutorial](https://pytorch.org/tutorials/intermediate/dist_tuto.html) for more information.
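+
+For reference, the per-worker set-up these scripts arrange boils down to initializing `torch.distributed`. A minimal sketch, not part of the provided scripts, and assuming the launcher exports the usual `RANK`, `WORLD_SIZE`, `LOCAL_RANK`, `MASTER_ADDR`, and `MASTER_PORT` environment variables:
+
+```
+import os
+
+import torch
+from torch import distributed as distrib
+
+# Prefer NCCL when GPUs are present; otherwise fall back to GLOO.
+backend = "nccl" if torch.cuda.is_available() else "gloo"
+
+# The default init_method ("env://") reads MASTER_ADDR / MASTER_PORT.
+distrib.init_process_group(
+    backend,
+    rank=int(os.environ["RANK"]),
+    world_size=int(os.environ["WORLD_SIZE"]),
+)
+
+# Pin this worker to its local GPU before creating any CUDA tensors.
+if torch.cuda.is_available():
+    torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", "0")))
+```
+
+With either backend the training loop is identical; only the collective-communication transport changes.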
+
+### Verifying gradient reduction
+
+Because RL differs from supervised learning, the way DD-PPO interfaces with PyTorch's DistributedDataParallel is slightly off the beaten path, and while it is reasonably robust, new versions of PyTorch have broken it in the past. Our CI does not test against every version of PyTorch, so if there is ever concern that gradient reduction may not be working, run the unit test locally:
+
+```
+pytest test/test_ddppo_reduce.py
+```
+
+## Pretrained Models (PointGoal Navigation with GPS+Compass)
+
+
+All weights are available as a zip [here](https://dl.fbaipublicfiles.com/habitat/data/baselines/v1/ddppo/ddppo-models.zip).
+
+### Depth models
+
+| Architecture | Training Data | Val SPL | Test SPL | URL |
+| ------------ | ------------- | ------- | -------- | --- |
+| ResNet50 + LSTM512 | Gibson 4+ | 0.922 | 0.917 | [gibson-4plus-resnet50.pth](https://dl.fbaipublicfiles.com/habitat/data/baselines/v1/ddppo/ddppo-models/gibson-4plus-resnet50.pth) |
+| ResNet50 + LSTM512 | Gibson 4+ and MP3D(train/val/test)
**Caution:** Trained on MP3D val and test | 0.956 | 0.941 | [gibson-4plus-mp3d-train-val-test-resnet50.pth](https://dl.fbaipublicfiles.com/habitat/data/baselines/v1/ddppo/ddppo-models/gibson-4plus-mp3d-train-val-test-resnet50.pth) | +| ResNet50 + LSTM512 | Gibson 2+ | 0.956 | 0.944 | [gibson-2plus-resnet50.pth](https://dl.fbaipublicfiles.com/habitat/data/baselines/v1/ddppo/ddppo-models/gibson-2plus-resnet50.pth)| +| SE-ResNeXt50 + LSTM512 | Gibson 2+ | 0.959 | 0.943 | [gibson-2plus-se-resneXt50.pth](https://dl.fbaipublicfiles.com/habitat/data/baselines/v1/ddppo/ddppo-models/gibson-2plus-se-resneXt50.pth)| +| SE-ResNeXt101 + LSTM1024 | Gibson 2+ | 0.969 | 0.948 | [gibson-2plus-se-resneXt101-lstm1024.pth](https://dl.fbaipublicfiles.com/habitat/data/baselines/v1/ddppo/ddppo-models/gibson-2plus-se-resneXt101-lstm1024.pth)| + +### RGB models + +| Architecture | Training Data | Val SPL | Test SPL | URL | +| ------------ | ------------- | ------- | -------- | --- | +| SE-ResNeXt50 + LSTM512 | Gibson 2+ and MP3D(train/val/test)
**Caution:** Trained on MP3D val and test | 0.933 | 0.920 | [gibson-2plus-mp3d-train-val-test-se-resneXt50-rgb.pth](https://dl.fbaipublicfiles.com/habitat/data/baselines/v1/ddppo/ddppo-models/gibson-2plus-mp3d-train-val-test-se-resneXt50-rgb.pth) | + + +### Blind Models + +| Architecture | Training Data | Val SPL | Test SPL | URL | +| ------------ | ------------- | ------- | -------- | --- | +| LSTM512 | Gibson 0+ and MP3D(train/val/test)
**Caution:** Trained on MP3D val and test | 0.729 | 0.676 | [gibson-0plus-mp3d-train-val-test-blind.pth](https://dl.fbaipublicfiles.com/habitat/data/baselines/v1/ddppo/ddppo-models/gibson-0plus-mp3d-train-val-test-blind.pth) | + + + + +**Note:** Evaluation was done with *sampled* actions. + +All model weights are subject to [Matterport3D Terms-of-Use](http://dovahkiin.stanford.edu/matterport/public/MP_TOS.pdf). + + +## Citing + +If you use DD-PPO or the model-weights in your research, please cite the following [paper](https://arxiv.org/abs/1911.00357): + + @article{wijmans2020ddppo, + title = {{DD-PPO}: {L}earning Near-Perfect PointGoal Navigators from 2.5 Billion Frames}, + author = {Erik Wijmans and Abhishek Kadian and Ari Morcos and Stefan Lee and Irfan Essa and Devi Parikh and Manolis Savva and Dhruv Batra}, + journal = {International Conference on Learning Representations (ICLR)}, + year = {2020} + } diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/__init__.py b/habitat-lab-dialog/habitat_baselines/rl/ddppo/__init__.py new file mode 100644 index 0000000..abc5be9 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +from habitat_baselines.rl.ddppo.policy.resnet_policy import ( + PointNavResNetPolicy, +) diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/algo/__init__.py b/habitat-lab-dialog/habitat_baselines/rl/ddppo/algo/__init__.py new file mode 100644 index 0000000..d38c743 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/algo/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat_baselines.rl.ddppo.algo.ddppo import DDPPO diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/algo/ddp_utils.py b/habitat-lab-dialog/habitat_baselines/rl/ddppo/algo/ddp_utils.py new file mode 100644 index 0000000..dd12e78 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/algo/ddp_utils.py @@ -0,0 +1,254 @@ +import functools +import os +import shlex +import signal +import subprocess +import threading +from os import path as osp +from typing import Any, Callable, Optional, Tuple, Union, overload + +import ifcfg +import torch +from torch import distributed as distrib + +from habitat import logger + +EXIT = threading.Event() +EXIT.clear() +REQUEUE = threading.Event() +REQUEUE.clear() + + +# Default port to initialized the TCP store on +DEFAULT_PORT = 8738 +DEFAULT_PORT_RANGE = 127 +# Default address of world rank 0 +DEFAULT_MASTER_ADDR = "127.0.0.1" + +SLURM_JOBID = os.environ.get("SLURM_JOB_ID", None) +INTERRUPTED_STATE_FILE = osp.join( + os.environ["HOME"], ".interrupted_states", f"{SLURM_JOBID}.pth" +) + + +def is_slurm_job() -> bool: + return SLURM_JOBID is not None + + +def is_slurm_batch_job() -> bool: + r"""Heuristic to determine if a slurm job is a batch job or not. Batch jobs + will have a job name that is not a shell unless the user specifically set the job + name to that of a shell. Interactive jobs have a shell name as their job name. 
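+ For example, a job whose SLURM_JOB_NAME is "bash" is classified as interactive.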
+ """ + return is_slurm_job() and os.environ.get("SLURM_JOB_NAME", None) not in ( + None, + "bash", + "zsh", + "fish", + "tcsh", + "sh", + ) + + +@overload +def rank0_only() -> bool: + ... + + +@overload +def rank0_only(fn: Callable) -> Callable: + ... + + +def rank0_only(fn: Optional[Callable] = None) -> Union[Callable, bool]: + r"""Helper function to only execute code if a process is world rank 0 + + Can be used both as a function in an if statement, + + .. code:: py + + if rank0_only(): + ... + + or as a decorator, + + .. code:: py + + @rank0_only + def fn_for_r0_only(...): + ... + + :param fn: Function to wrap and only execute if the process is rank 0. + If a process is rank 0, the function will be run and it's return value + will be returned. If a process is not rank 0, then the function will not + be ran and :py:`None` will be returned. + + :return: The wrapped function if :p:`fn` is not :py:`None`, otherwise + whether or not this process is rank 0 + """ + if fn is None: + return ( + not torch.distributed.is_initialized() + or torch.distributed.get_rank() == 0 + ) + + @functools.wraps(fn) + def _wrapper(*args, **kwargs): + if rank0_only(): + return fn(*args, **kwargs) + return None + + return _wrapper + + +def _clean_exit_handler(signum, frame): + EXIT.set() + print("Exiting cleanly", flush=True) + + +def _requeue_handler(signal, frame): + print("Got signal to requeue", flush=True) + EXIT.set() + REQUEUE.set() + + +def add_signal_handlers() -> None: + signal.signal(signal.SIGINT, _clean_exit_handler) + signal.signal(signal.SIGTERM, _clean_exit_handler) + + # SIGUSR2 can be sent to all processes to have them cleanup + # and exit nicely. This is nice to use with SLURM as scancel + # sets a 30 second timer for the job to exit, and it can take more than + # 30 seconds for the job to cleanup and exit nicely. When using NCCL, + # forcing the job to exit without cleaning up can be bad. + # scancel --signal SIGUSR2 will set no such timer and will give + # the job ample time to cleanup and exit. + signal.signal(signal.SIGUSR2, _clean_exit_handler) + + signal.signal(signal.SIGUSR1, _requeue_handler) + + +@rank0_only +def save_interrupted_state(state: Any, filename: str = None): + r"""Saves the interrupted job state to the specified filename. + This is useful when working with preemptable job partitions. + + This method will do nothing if SLURM is not currently being used and the filename is the default + + :param state: The state to save + :param filename: The filename. Defaults to "${HOME}/.interrupted_states/${SLURM_JOBID}.pth" + """ + if SLURM_JOBID is None and filename is None: + logger.warn("SLURM_JOBID is none, not saving interrupted state") + return + + if filename is None: + filename = INTERRUPTED_STATE_FILE + if not osp.exists(osp.dirname(INTERRUPTED_STATE_FILE)): + raise RuntimeError( + "Please create a .interrupted_states directory in your home directory for job preemption" + "(This is intentionally not created automatically as it can get quite large)" + ) + + torch.save(state, filename) + + +def load_interrupted_state(filename: str = None) -> Optional[Any]: + r"""Loads the saved interrupted state + + :param filename: The filename of the saved state. 
+ Defaults to "${HOME}/.interrupted_states/${SLURM_JOBID}.pth" + + :return: The saved state if the file exists, else None + """ + if SLURM_JOBID is None and filename is None: + return None + + if filename is None: + filename = INTERRUPTED_STATE_FILE + + if not osp.exists(filename): + return None + + return torch.load(filename, map_location="cpu") + +
+def requeue_job(): + r"""Requeues the job by calling ``scontrol requeue ${SLURM_JOBID}``""" + if SLURM_JOBID is None: + return + + if not REQUEUE.is_set(): + return + + if distrib.is_initialized(): + distrib.barrier() + + if rank0_only(): + logger.info(f"Requeueing job {SLURM_JOBID}") + subprocess.check_call(shlex.split(f"scontrol requeue {SLURM_JOBID}")) + + +def get_ifname() -> str: + return ifcfg.default_interface()["device"] + +
+def get_distrib_size() -> Tuple[int, int, int]: + # Check to see if we should parse from torch.distributed.launch + if os.environ.get("LOCAL_RANK", None) is not None: + local_rank = int(os.environ["LOCAL_RANK"]) + world_rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + # Else parse from SLURM if using SLURM + elif os.environ.get("SLURM_JOBID", None) is not None: + local_rank = int(os.environ["SLURM_LOCALID"]) + world_rank = int(os.environ["SLURM_PROCID"]) + world_size = int(os.environ["SLURM_NTASKS"]) + # Otherwise set up just 1 process; this is nice for testing + else: + local_rank = 0 + world_rank = 0 + world_size = 1 + + return local_rank, world_rank, world_size + +
+def init_distrib_slurm( + backend: str = "nccl", +) -> Tuple[int, torch.distributed.TCPStore]: # type: ignore + r"""Initializes torch.distributed by parsing environment variables set + by SLURM when ``srun`` is used or by parsing environment variables set + by torch.distributed.launch + + :param backend: Which torch.distributed backend to use + + :returns: Tuple of the local_rank (aka which GPU to use for this process) + and the TCPStore used for the rendezvous + """
+ assert ( + torch.distributed.is_available() + ), "torch.distributed must be available" + + if "GLOO_SOCKET_IFNAME" not in os.environ: + os.environ["GLOO_SOCKET_IFNAME"] = get_ifname() + + if "NCCL_SOCKET_IFNAME" not in os.environ: + os.environ["NCCL_SOCKET_IFNAME"] = get_ifname() + + local_rank, world_rank, world_size = get_distrib_size() +
+ master_port = int(os.environ.get("MASTER_PORT", DEFAULT_PORT)) + if SLURM_JOBID is not None: + master_port += int(SLURM_JOBID) % int( + os.environ.get("MASTER_PORT_RANGE", DEFAULT_PORT_RANGE) + ) + master_addr = os.environ.get("MASTER_ADDR", DEFAULT_MASTER_ADDR) + + tcp_store = distrib.TCPStore( # type: ignore + master_addr, master_port, world_size, world_rank == 0 + ) + distrib.init_process_group( + backend, store=tcp_store, rank=world_rank, world_size=world_size + ) + + return local_rank, tcp_store diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/algo/ddppo.py b/habitat-lab-dialog/habitat_baselines/rl/ddppo/algo/ddppo.py new file mode 100644 index 0000000..dcddc28 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/algo/ddppo.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree.
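+
+# In short: DD-PPO normalizes advantages with mean/variance statistics that are
+# all-reduced across workers, and delegates gradient averaging to PyTorch's
+# DistributedDataParallel through the mixin defined below.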
+ +from typing import Tuple + +import torch +from torch import distributed as distrib + +from habitat_baselines.common.rollout_storage import RolloutStorage +from habitat_baselines.rl.ppo import PPO + +EPS_PPO = 1e-5 + +
+def distributed_mean_and_var( + values: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Computes the mean and variance of a tensor over multiple workers. + + This method is equivalent to first collecting all versions of values and + then computing the mean and variance locally over that. + + :param values: (*,) shaped tensors to compute mean and variance over. Assumed + to be solely the worker's local copy of this tensor; + the resultant mean and variance will be computed + over _all_ workers' versions of this tensor. + """
+ assert distrib.is_initialized(), "Distributed must be initialized" + + world_size = distrib.get_world_size() + + mean = values.mean() + distrib.all_reduce(mean) + mean = mean / world_size + + var = (values - mean).pow(2).mean() + distrib.all_reduce(var) + var = var / world_size + + return mean, var + +
+class _EvalActionsWrapper(torch.nn.Module): + r"""Wrapper on evaluate_actions that allows it to be called from forward. + This is needed to interface with DistributedDataParallel's forward call + """ + + def __init__(self, actor_critic): + super().__init__() + self.actor_critic = actor_critic + + def forward(self, *args, **kwargs): + return self.actor_critic.evaluate_actions(*args, **kwargs) + +
+class DecentralizedDistributedMixin: + def _get_advantages_distributed( + self, rollouts: RolloutStorage + ) -> torch.Tensor: + advantages = ( + rollouts.buffers["returns"][: rollouts.current_rollout_step_idx] + - rollouts.buffers["value_preds"][ + : rollouts.current_rollout_step_idx + ] + ) + if not self.use_normalized_advantage: # type: ignore + return advantages + + mean, var = distributed_mean_and_var(advantages) + + return (advantages - mean) / (var.sqrt() + EPS_PPO) +
+ def init_distributed(self, find_unused_params: bool = True) -> None: + r"""Initializes distributed training for the model + + 1. Broadcasts the model weights from world_rank 0 to all other workers + 2. Adds gradient hooks to the model + + :param find_unused_params: Whether or not to filter out unused parameters + before gradient reduction. This *must* be True if + there are any parameters in the model that were unused in the + forward pass, otherwise the gradient reduction + will not work correctly. + """
+ # NB: Used to hide the hooks from the nn.Module, + # so they don't show up in the state_dict + class Guard: + def __init__(self, model, device): + if torch.cuda.is_available(): + self.ddp = torch.nn.parallel.DistributedDataParallel( + model, + device_ids=[device], + output_device=device, + find_unused_parameters=find_unused_params, + ) + else: + self.ddp = torch.nn.parallel.DistributedDataParallel( + model, + find_unused_parameters=find_unused_params, + ) + + self._evaluate_actions_wrapper = Guard(_EvalActionsWrapper(self.actor_critic), self.device) # type: ignore + + def _evaluate_actions( + self, observations, rnn_hidden_states, prev_actions, masks, action + ): + r"""Internal method that calls Policy.evaluate_actions.
This is used instead of calling + that directly so that the call can be overridden via inheritance + """ + return self._evaluate_actions_wrapper.ddp( + observations, rnn_hidden_states, prev_actions, masks, action + ) + + +class DDPPO(DecentralizedDistributedMixin, PPO): + pass diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/data_generation/create_gibson_large_dataset.py b/habitat-lab-dialog/habitat_baselines/rl/ddppo/data_generation/create_gibson_large_dataset.py new file mode 100644 index 0000000..94084cb --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/data_generation/create_gibson_large_dataset.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +
+"""This script is used to generate customized Gibson training splits for the +PointNav task. The scenes in Gibson are ranked from 1-5 based on the +reconstruction quality (see https://arxiv.org/pdf/1904.01201.pdf for more +details). This script generates data for all scenes with a minimum quality of +q_thresh. +""" +import glob +import gzip +import json +import multiprocessing +import os +from os import path as osp + +import tqdm + +import habitat +from habitat.datasets.pointnav.pointnav_generator import ( + generate_pointnav_episode, +) +
+NUM_EPISODES_PER_SCENE = int(1e4) +# Sample all scenes with a minimum quality +QUAL_THRESH = 2 + + +def safe_mkdir(path): + try: + os.mkdir(path) + except OSError: + pass + +
+def _generate_fn(scene): + cfg = habitat.get_config() + cfg.defrost() + cfg.SIMULATOR.SCENE = scene + cfg.SIMULATOR.AGENT_0.SENSORS = [] + cfg.freeze() + + sim = habitat.sims.make_sim("Sim-v0", config=cfg.SIMULATOR) + + dset = habitat.datasets.make_dataset("PointNav-v1") + dset.episodes = list( + generate_pointnav_episode( + sim, NUM_EPISODES_PER_SCENE, is_gen_shortest_path=False + ) + ) + for ep in dset.episodes: + ep.scene_id = ep.scene_id[len("./data/scene_datasets/") :] +
+ scene_key = scene.split("/")[-1].split(".")[0] + out_file = ( + f"./data/datasets/pointnav/gibson/v2/train_large/content/" + f"{scene_key}.json.gz" + ) + os.makedirs(osp.dirname(out_file), exist_ok=True) + with gzip.open(out_file, "wt") as f: + f.write(dset.to_json()) + +
+def generate_gibson_large_dataset(): + # Load train / val statistics + with open( + osp.join(osp.dirname(__file__), "gibson_dset_with_qual.json"), "r" + ) as f: + dataset_statistics = json.load(f) + + gibson_large_scene_keys = [] + for k, v in dataset_statistics.items(): + qual = v["qual"] + if ( + v["split_full+"] == "train" + and qual is not None + and qual >= QUAL_THRESH + ): + gibson_large_scene_keys.append(k) +
+ scenes = glob.glob("./data/scene_datasets/gibson/*.glb") + # Filter out invalid scenes + _fltr = lambda x: x.split("/")[-1].split(".")[0] in gibson_large_scene_keys + scenes = list(filter(_fltr, scenes)) + print(f"Total number of training scenes: {len(scenes)}") + + safe_mkdir("./data/datasets/pointnav/gibson/v2/train_large") + with multiprocessing.Pool(8) as pool, tqdm.tqdm(total=len(scenes)) as pbar: + for _ in pool.imap_unordered(_generate_fn, scenes): + pbar.update() + + path = "./data/datasets/pointnav/gibson/v2/train_large/train_large.json.gz" + with gzip.open(path, "wt") as f: + json.dump(dict(episodes=[]), f) + + +if __name__ == "__main__": + generate_gibson_large_dataset() diff --git
a/habitat-lab-dialog/habitat_baselines/rl/ddppo/data_generation/gibson_dset_with_qual.json b/habitat-lab-dialog/habitat_baselines/rl/ddppo/data_generation/gibson_dset_with_qual.json new file mode 100644 index 0000000..81d0cb4 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/data_generation/gibson_dset_with_qual.json @@ -0,0 +1 @@ +{"Adrian": {"id": "Adrian", "name": "model-546", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 393.795, "floor": 4, "navigation_complexity": 3.285, "room": 12, "ssa": 1.66}, "qual": 5}, "Airport": {"id": "Airport", "name": "model-190", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 581.725, "floor": 4, "navigation_complexity": 4.294, "room": 17, "ssa": 1.091}, "qual": 3}, "Akiak": {"id": "Akiak", "name": "model-294", "split_full": "none", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 123.683, "floor": 1, "navigation_complexity": 1.213, "room": 3, "ssa": 1.227}, "qual": 2}, "Albertville": {"id": "Albertville", "name": "model-117", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 266.125, "floor": 4, "navigation_complexity": 3.737, "room": 16, "ssa": 1.297}, "qual": 5}, "Aldine": {"id": "Aldine", "name": "model-417", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 287.108, "floor": 3, "navigation_complexity": 5.723, "room": 14, "ssa": 1.346}, "qual": 2}, "Aldrich": {"id": "Aldrich", "name": "model-76", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 915.082, "floor": 4, "navigation_complexity": 4.016, "room": 15, "ssa": 1.203}, "qual": 0}, "Alfred": {"id": "Alfred", "name": "model-566", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 850.512, "floor": 3, "navigation_complexity": 2.691, "room": 13, "ssa": 1.946}, "qual": 1}, "Allensville": {"id": "Allensville", "name": "model-489", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 80.306, "floor": 1, "navigation_complexity": 5.35, "room": 7, "ssa": 0.885}, "qual": 3}, "Almena": {"id": "Almena", "name": "model-519", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 170.554, "floor": 1, "navigation_complexity": 3.88, "room": 9, "ssa": 1.536}, "qual": 3}, "Almota": {"id": "Almota", "name": "model-313", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 414.056, "floor": 4, "navigation_complexity": 1.066, "room": 17, "ssa": 0.785}, "qual": 3}, "Aloha": {"id": "Aloha", "name": "model-8", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 111.735, "floor": 2, "navigation_complexity": 3.723, "room": 8, "ssa": 1.741}, "qual": 3}, "Alstown": {"id": "Alstown", "name": "model-97", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 426.819, "floor": 4, "navigation_complexity": 1.43, "room": 18, "ssa": 1.372}, "qual": 1}, "American": {"id": "American", "name": "model-293", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 583.318, "floor": 3, 
"navigation_complexity": 1.93, "room": 20, "ssa": 1.227}, "qual": 3}, "Anaheim": {"id": "Anaheim", "name": "model-479", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 299.527, "floor": 4, "navigation_complexity": 2.755, "room": 18, "ssa": 1.128}, "qual": 4}, "Ancor": {"id": "Ancor", "name": "model-216", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 159.084, "floor": 4, "navigation_complexity": 1.858, "room": 15, "ssa": 1.992}, "qual": 3}, "Andover": {"id": "Andover", "name": "model-130", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 674.595, "floor": 4, "navigation_complexity": 3.514, "room": 19, "ssa": 1.208}, "qual": 4}, "Angiola": {"id": "Angiola", "name": "model-443", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 18.153, "floor": 1, "navigation_complexity": 1.264, "room": 3, "ssa": 1.457}, "qual": 5}, "Annawan": {"id": "Annawan", "name": "model-402", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 75.039, "floor": 1, "navigation_complexity": 3.68, "room": 7, "ssa": 0.872}, "qual": 5}, "Annona": {"id": "Annona", "name": "model-243", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 359.046, "floor": 3, "navigation_complexity": 2.193, "room": 16, "ssa": 1.105}, "qual": 3}, "Anthoston": {"id": "Anthoston", "name": "model-146", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 477.51, "floor": 1, "navigation_complexity": 4.809, "room": 16, "ssa": 1.561}, "qual": 1}, "Ackermanville": {"id": "Ackermanville", "name": "model-123", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 499.522, "floor": 1, "navigation_complexity": 2.943, "room": 1, "ssa": 1.187}, "qual": 0}, "Adairsville": {"id": "Adairsville", "name": "model-17", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 232.438, "floor": 5, "navigation_complexity": 2.201, "room": 17, "ssa": 1.84}, "qual": 3}, "Apache": {"id": "Apache", "name": "model-543", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1024.699, "floor": 4, "navigation_complexity": 6.585, "room": 34, "ssa": 2.011}, "qual": 0}, "Applewold": {"id": "Applewold", "name": "model-142", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 187.518, "floor": 4, "navigation_complexity": 1.054, "room": 12, "ssa": 1.87}, "qual": 4}, "Arbutus": {"id": "Arbutus", "name": "model-4", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 53.595, "floor": 1, "navigation_complexity": 2.887, "room": 4, "ssa": 1.888}, "qual": 3}, "Archer": {"id": "Archer", "name": "model-433", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 281.014, "floor": 4, "navigation_complexity": 7.749, "room": 26, "ssa": 1.704}, "qual": 2}, "Arkansaw": {"id": "Arkansaw", "name": "model-555", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 385.425, "floor": 4, 
"navigation_complexity": 2.781, "room": 17, "ssa": 1.515}, "qual": 5}, "Arona": {"id": "Arona", "name": "model-16", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 62.098, "floor": 1, "navigation_complexity": 2.352, "room": 6, "ssa": 1.505}, "qual": 3}, "Artois": {"id": "Artois", "name": "model-334", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 100.399, "floor": 1, "navigation_complexity": 3.039, "room": 5, "ssa": 1.432}, "qual": 3}, "Ashport": {"id": "Ashport", "name": "model-202", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 573.641, "floor": 3, "navigation_complexity": 2.345, "room": 17, "ssa": 1.275}, "qual": 2}, "Assinippi": {"id": "Assinippi", "name": "model-211", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 365.543, "floor": 4, "navigation_complexity": 4.242, "room": 18, "ssa": 1.104}, "qual": 2}, "Athens": {"id": "Athens", "name": "model-258", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 440.851, "floor": 4, "navigation_complexity": 1.335, "room": 17, "ssa": 1.574}, "qual": 3}, "Auburn": {"id": "Auburn", "name": "model-121", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 430.311, "floor": 1, "navigation_complexity": 7.277, "room": 25, "ssa": 1.62}, "qual": 3}, "Aulander": {"id": "Aulander", "name": "model-406", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 95.115, "floor": 2, "navigation_complexity": 2.058, "room": 8, "ssa": 1.209}, "qual": 2}, "Avonia": {"id": "Avonia", "name": "model-464", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 59.6, "floor": 1, "navigation_complexity": 2.311, "room": 2, "ssa": 1.337}, "qual": 5}, "Azusa": {"id": "Azusa", "name": "model-343", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 507.047, "floor": 3, "navigation_complexity": 4.327, "room": 14, "ssa": 1.647}, "qual": 4}, "Badger": {"id": "Badger", "name": "model-422", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 144.427, "floor": 1, "navigation_complexity": 4.57, "room": 9, "ssa": 1.583}, "qual": 3}, "Ballantine": {"id": "Ballantine", "name": "model-65", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1153.639, "floor": 3, "navigation_complexity": 2.892, "room": 23, "ssa": 1.7}, "qual": 1}, "Ballou": {"id": "Ballou", "name": "model-11", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 533.456, "floor": 3, "navigation_complexity": 5.531, "room": 19, "ssa": 1.91}, "qual": 5}, "Baneberry": {"id": "Baneberry", "name": "model-63", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 660.51, "floor": 3, "navigation_complexity": 4.34, "room": 20, "ssa": 1.88}, "qual": 3}, "Barahona": {"id": "Barahona", "name": "model-233", "split_full": "none", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 57.115, "floor": 1, "navigation_complexity": 2.482, "room": 3, "ssa": 1.398}, 
"qual": 3}, "Barboursville": {"id": "Barboursville", "name": "model-380", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 61.422, "floor": 1, "navigation_complexity": 1.931, "room": 4, "ssa": 1.375}, "qual": 3}, "Barranquitas": {"id": "Barranquitas", "name": "model-388", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 476.868, "floor": 3, "navigation_complexity": 1.946, "room": 22, "ssa": 1.274}, "qual": 3}, "Bautista": {"id": "Bautista", "name": "model-391", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 305.479, "floor": 4, "navigation_complexity": 1.265, "room": 17, "ssa": 0.9}, "qual": 1}, "Beach": {"id": "Beach", "name": "model-356", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 38.23, "floor": 1, "navigation_complexity": 2.122, "room": 4, "ssa": 1.834}, "qual": 5}, "Beechwood": {"id": "Beechwood", "name": "model-101", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 464.624, "floor": 3, "navigation_complexity": 8.813, "room": 16, "ssa": 1.352}, "qual": 1}, "Bellemeade": {"id": "Bellemeade", "name": "model-303", "split_full": "none", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 70.337, "floor": 1, "navigation_complexity": 1.0, "room": 1, "ssa": 1.008}, "qual": 3}, "Bellwood": {"id": "Bellwood", "name": "model-403", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 657.554, "floor": 3, "navigation_complexity": 2.251, "room": 25, "ssa": 1.346}, "qual": 0}, "Belpre": {"id": "Belpre", "name": "model-474", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 584.831, "floor": 3, "navigation_complexity": 4.114, "room": 21, "ssa": 1.353}, "qual": 2}, "Benevolence": {"id": "Benevolence", "name": "model-424", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 176.734, "floor": 4, "navigation_complexity": 3.2, "room": 11, "ssa": 1.353}, "qual": 2}, "Benicia": {"id": "Benicia", "name": "model-91", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 83.084, "floor": 1, "navigation_complexity": 2.169, "room": 6, "ssa": 1.03}, "qual": 3}, "Bertram": {"id": "Bertram", "name": "model-550", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 172.167, "floor": 1, "navigation_complexity": 3.29, "room": 10, "ssa": 0.698}, "qual": 3}, "Bethlehem": {"id": "Bethlehem", "name": "model-204", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 603.61, "floor": 4, "navigation_complexity": 1.622, "room": 24, "ssa": 1.821}, "qual": 2}, "Bettendorf": {"id": "Bettendorf", "name": "model-289", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 4371.903, "floor": 6, "navigation_complexity": 3.36, "room": 21, "ssa": 0.493}, "qual": 0}, "Biltmore": {"id": "Biltmore", "name": "model-220", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 733.268, "floor": 1, "navigation_complexity": 2.707, "room": 38, 
"ssa": 1.156}, "qual": 0}, "Blackstone": {"id": "Blackstone", "name": "model-331", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 228.077, "floor": 1, "navigation_complexity": 1.845, "room": 11, "ssa": 1.443}, "qual": 1}, "Blenheim": {"id": "Blenheim", "name": "model-322", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 192.317, "floor": 2, "navigation_complexity": 3.54, "room": 7, "ssa": 1.68}, "qual": 3}, "Bohemia": {"id": "Bohemia", "name": "model-170", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 394.14, "floor": 3, "navigation_complexity": 1.583, "room": 14, "ssa": 1.665}, "qual": 1}, "Bolton": {"id": "Bolton", "name": "model-253", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 61.868, "floor": 1, "navigation_complexity": 2.596, "room": 4, "ssa": 1.568}, "qual": 4}, "Bonesteel": {"id": "Bonesteel", "name": "model-193", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 272.066, "floor": 3, "navigation_complexity": 2.257, "room": 16, "ssa": 1.413}, "qual": 3}, "Bonfield": {"id": "Bonfield", "name": "model-58", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 86.161, "floor": 1, "navigation_complexity": 4.318, "room": 6, "ssa": 1.229}, "qual": 4}, "Bonnie": {"id": "Bonnie", "name": "model-98", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 799.666, "floor": 4, "navigation_complexity": 4.395, "room": 18, "ssa": 1.372}, "qual": 2}, "Booth": {"id": "Booth", "name": "model-383", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 209.756, "floor": 4, "navigation_complexity": 1.149, "room": 17, "ssa": 1.212}, "qual": 2}, "Bountiful": {"id": "Bountiful", "name": "model-78", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 158.824, "floor": 3, "navigation_complexity": 3.094, "room": 12, "ssa": 1.732}, "qual": 2}, "Bowlus": {"id": "Bowlus", "name": "model-542", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 657.858, "floor": 4, "navigation_complexity": 2.104, "room": 26, "ssa": 1.28}, "qual": 4}, "Bowmore": {"id": "Bowmore", "name": "model-523", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1067.245, "floor": 5, "navigation_complexity": 1.38, "room": 31, "ssa": 1.687}, "qual": 0}, "Branford": {"id": "Branford", "name": "model-551", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 13.291, "floor": 1, "navigation_complexity": 1.136, "room": 4, "ssa": 1.391}, "qual": 2}, "Braxton": {"id": "Braxton", "name": "model-52", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 741.691, "floor": 4, "navigation_complexity": 3.866, "room": 20, "ssa": 1.791}, "qual": 1}, "Bremerton": {"id": "Bremerton", "name": "model-57", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 122.66, "floor": 1, "navigation_complexity": 2.852, "room": 5, "ssa": 1.229}, 
"qual": 2}, "Brentsville": {"id": "Brentsville", "name": "model-405", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 151.865, "floor": 1, "navigation_complexity": 3.072, "room": 10, "ssa": 1.209}, "qual": 4}, "Brevort": {"id": "Brevort", "name": "model-526", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 43.882, "floor": 1, "navigation_complexity": 2.915, "room": 5, "ssa": 2.061}, "qual": 5}, "Brewton": {"id": "Brewton", "name": "model-562", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 496.904, "floor": 3, "navigation_complexity": 4.949, "room": 14, "ssa": 1.02}, "qual": 2}, "Brinnon": {"id": "Brinnon", "name": "model-3", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 1171.902, "floor": 5, "navigation_complexity": 5.601, "room": 30, "ssa": 1.888}, "qual": 3}, "Broadwell": {"id": "Broadwell", "name": "model-569", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 457.211, "floor": 3, "navigation_complexity": 3.391, "room": 11, "ssa": 1.182}, "qual": 3}, "Broseley": {"id": "Broseley", "name": "model-285", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 782.684, "floor": 4, "navigation_complexity": 4.05, "room": 31, "ssa": 1.067}, "qual": 2}, "Brown": {"id": "Brown", "name": "model-316", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 809.315, "floor": 5, "navigation_complexity": 2.608, "room": 36, "ssa": 1.241}, "qual": 0}, "Browntown": {"id": "Browntown", "name": "model-518", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 837.527, "floor": 4, "navigation_complexity": 4.673, "room": 17, "ssa": 1.33}, "qual": 3}, "Burien": {"id": "Burien", "name": "model-396", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 485.288, "floor": 4, "navigation_complexity": 1.747, "room": 24, "ssa": 1.491}, "qual": 3}, "Bushong": {"id": "Bushong", "name": "model-373", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 3130.477, "floor": 2, "navigation_complexity": 6.76, "room": 57, "ssa": 2.062}, "qual": 0}, "Byers": {"id": "Byers", "name": "model-324", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 217.218, "floor": 4, "navigation_complexity": 1.693, "room": 17, "ssa": 1.203}, "qual": 2}, "Cabin": {"id": "Cabin", "name": "model-100", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 155.706, "floor": 1, "navigation_complexity": 1.0, "room": 9, "ssa": 1.027}, "qual": 2}, "Calavo": {"id": "Calavo", "name": "model-295", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 239.244, "floor": 1, "navigation_complexity": 5.611, "room": 15, "ssa": 1.593}, "qual": 3}, "Callicoon": {"id": "Callicoon", "name": "model-446", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 581.598, "floor": 3, "navigation_complexity": 3.054, "room": 21, "ssa": 1.375}, "qual": 0}, 
"Calmar": {"id": "Calmar", "name": "model-498", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 337.188, "floor": 3, "navigation_complexity": 1.718, "room": 10, "ssa": 1.524}, "qual": 3}, "Cantwell": {"id": "Cantwell", "name": "model-317", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 107.582, "floor": 1, "navigation_complexity": 1.016, "room": 8, "ssa": 1.309}, "qual": 4}, "Capistrano": {"id": "Capistrano", "name": "model-226", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 261.927, "floor": 3, "navigation_complexity": 1.145, "room": 12, "ssa": 1.526}, "qual": 5}, "Carneiro": {"id": "Carneiro", "name": "model-397", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 130.541, "floor": 3, "navigation_complexity": 5.664, "room": 2, "ssa": 1.764}, "qual": 2}, "Carpendale": {"id": "Carpendale", "name": "model-451", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 225.156, "floor": 1, "navigation_complexity": 3.424, "room": 13, "ssa": 1.415}, "qual": 2}, "Carpio": {"id": "Carpio", "name": "model-82", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 70.216, "floor": 1, "navigation_complexity": 1.436, "room": 9, "ssa": 1.617}, "qual": 3}, "Caruthers": {"id": "Caruthers", "name": "model-33", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 129.048, "floor": 1, "navigation_complexity": 2.878, "room": 9, "ssa": 1.533}, "qual": 4}, "Cashel": {"id": "Cashel", "name": "model-366", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 223.371, "floor": 3, "navigation_complexity": 1.015, "room": 13, "ssa": 0.988}, "qual": 2}, "Cason": {"id": "Cason", "name": "model-302", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 59.6, "floor": 1, "navigation_complexity": 2.709, "room": 3, "ssa": 1.153}, "qual": 1}, "Castor": {"id": "Castor", "name": "model-9", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 151.967, "floor": 3, "navigation_complexity": 2.109, "room": 9, "ssa": 1.577}, "qual": 2}, "Castroville": {"id": "Castroville", "name": "model-231", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 410.224, "floor": 3, "navigation_complexity": 1.747, "room": 13, "ssa": 1.043}, "qual": 3}, "Cauthron": {"id": "Cauthron", "name": "model-435", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 628.857, "floor": 3, "navigation_complexity": 5.43, "room": 16, "ssa": 1.309}, "qual": 3}, "Cayuse": {"id": "Cayuse", "name": "model-133", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 465.267, "floor": 4, "navigation_complexity": 1.001, "room": 30, "ssa": 1.956}, "qual": 1}, "Cebolla": {"id": "Cebolla", "name": "model-116", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 636.509, "floor": 4, "navigation_complexity": 3.503, "room": 19, "ssa": 1.613}, "qual": 1}, "Channel": {"id": 
"Channel", "name": "model-206", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1234.302, "floor": 5, "navigation_complexity": 2.001, "room": 45, "ssa": 1.483}, "qual": 1}, "Checotah": {"id": "Checotah", "name": "model-199", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 64.944, "floor": 2, "navigation_complexity": 1.697, "room": 6, "ssa": 1.546}, "qual": 2}, "Chesaning": {"id": "Chesaning", "name": "model-246", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 3351.796, "floor": 3, "navigation_complexity": 5.092, "room": 20, "ssa": 0.498}, "qual": 0}, "Chesterbrook": {"id": "Chesterbrook", "name": "model-171", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 551.319, "floor": 3, "navigation_complexity": 1.236, "room": 21, "ssa": 0.498}, "qual": 2}, "Chilhowie": {"id": "Chilhowie", "name": "model-40", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 398.26, "floor": 3, "navigation_complexity": 5.436, "room": 11, "ssa": 1.655}, "qual": 3}, "Chiloquin": {"id": "Chiloquin", "name": "model-143", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 621.968, "floor": 3, "navigation_complexity": 3.402, "room": 22, "ssa": 1.134}, "qual": 1}, "Chireno": {"id": "Chireno", "name": "model-56", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 152.69, "floor": 1, "navigation_complexity": 4.067, "room": 9, "ssa": 1.931}, "qual": 3}, "Chrisney": {"id": "Chrisney", "name": "model-47", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 123.959, "floor": 3, "navigation_complexity": 1.313, "room": 7, "ssa": 1.241}, "qual": 3}, "Churchton": {"id": "Churchton", "name": "model-124", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 275.265, "floor": 4, "navigation_complexity": 1.346, "room": 17, "ssa": 1.187}, "qual": 2}, "Circleville": {"id": "Circleville", "name": "model-156", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 109.787, "floor": 2, "navigation_complexity": 1.597, "room": 8, "ssa": 1.661}, "qual": 2}, "Cisne": {"id": "Cisne", "name": "model-351", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 230.839, "floor": 3, "navigation_complexity": 1.552, "room": 17, "ssa": 2.031}, "qual": 3}, "Clairton": {"id": "Clairton", "name": "model-434", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 111.607, "floor": 2, "navigation_complexity": 2.647, "room": 7, "ssa": 1.704}, "qual": 3}, "Clarkridge": {"id": "Clarkridge", "name": "model-165", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 920.049, "floor": 4, "navigation_complexity": 1.165, "room": 17, "ssa": 1.277}, "qual": 2}, "Clive": {"id": "Clive", "name": "model-50", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 594.258, "floor": 5, "navigation_complexity": 2.108, "room": 21, "ssa": 1.151}, "qual": 3}, 
"Cobalt": {"id": "Cobalt", "name": "model-328", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 324.923, "floor": 3, "navigation_complexity": 5.297, "room": 9, "ssa": 1.886}, "qual": 1}, "Cochranton": {"id": "Cochranton", "name": "model-442", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 278.634, "floor": 4, "navigation_complexity": 2.363, "room": 16, "ssa": 2.049}, "qual": 3}, "Codell": {"id": "Codell", "name": "model-46", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 526.822, "floor": 3, "navigation_complexity": 7.747, "room": 21, "ssa": 1.847}, "qual": 2}, "Coeburn": {"id": "Coeburn", "name": "model-570", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 104.971, "floor": 2, "navigation_complexity": 1.516, "room": 7, "ssa": 1.182}, "qual": 3}, "Coffeen": {"id": "Coffeen", "name": "model-148", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 286.735, "floor": 5, "navigation_complexity": 2.238, "room": 17, "ssa": 1.837}, "qual": 1}, "Cohoes": {"id": "Cohoes", "name": "model-276", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 521.686, "floor": 3, "navigation_complexity": 1.257, "room": 21, "ssa": 0.939}, "qual": 2}, "Cokeville": {"id": "Cokeville", "name": "model-139", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 103.567, "floor": 2, "navigation_complexity": 3.452, "room": 8, "ssa": 1.152}, "qual": 2}, "Colebrook": {"id": "Colebrook", "name": "model-515", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 87.088, "floor": 1, "navigation_complexity": 4.266, "room": 7, "ssa": 1.635}, "qual": 5}, "Collierville": {"id": "Collierville", "name": "model-323", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "val", "stats": {"area": 171.335, "floor": 4, "navigation_complexity": 3.944, "room": 12, "ssa": 1.203}, "qual": 3}, "Connellsville": {"id": "Connellsville", "name": "model-502", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 64.072, "floor": 1, "navigation_complexity": 2.101, "room": 6, "ssa": 1.422}, "qual": 4}, "Connoquenessing": {"id": "Connoquenessing", "name": "model-64", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 568.497, "floor": 4, "navigation_complexity": 3.705, "room": 14, "ssa": 1.88}, "qual": 2}, "Convoy": {"id": "Convoy", "name": "model-36", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 19.183, "floor": 1, "navigation_complexity": 1.0, "room": 2, "ssa": 1.46}, "qual": 5}, "Cooperstown": {"id": "Cooperstown", "name": "model-247", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 40.268, "floor": 1, "navigation_complexity": 1.341, "room": 3, "ssa": 1.725}, "qual": 5}, "Copemish": {"id": "Copemish", "name": "model-125", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 411.203, "floor": 3, "navigation_complexity": 3.653, "room": 14, "ssa": 1.182}, 
"qual": 1}, "Corder": {"id": "Corder", "name": "model-159", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 99.565, "floor": 1, "navigation_complexity": 1.354, "room": 2, "ssa": 1.113}, "qual": 0}, "Cornville": {"id": "Cornville", "name": "model-168", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 360.365, "floor": 3, "navigation_complexity": 3.89, "room": 14, "ssa": 1.179}, "qual": 3}, "Coronado": {"id": "Coronado", "name": "model-259", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 321.079, "floor": 1, "navigation_complexity": 4.153, "room": 22, "ssa": 1.251}, "qual": 2}, "Corozal": {"id": "Corozal", "name": "model-296", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "val", "stats": {"area": 577.136, "floor": 3, "navigation_complexity": 10.038, "room": 16, "ssa": 1.593}, "qual": 3}, "Cosmos": {"id": "Cosmos", "name": "model-368", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 371.618, "floor": 3, "navigation_complexity": 8.758, "room": 17, "ssa": 1.332}, "qual": 2}, "Cottonport": {"id": "Cottonport", "name": "model-25", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 776.583, "floor": 3, "navigation_complexity": 6.634, "room": 25, "ssa": 1.924}, "qual": 3}, "Couderay": {"id": "Couderay", "name": "model-341", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 214.625, "floor": 4, "navigation_complexity": 1.752, "room": 15, "ssa": 1.144}, "qual": 3}, "Country": {"id": "Country", "name": "model-463", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 730.187, "floor": 3, "navigation_complexity": 8.213, "room": 26, "ssa": 1.337}, "qual": 2}, "Cousins": {"id": "Cousins", "name": "model-75", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 660.449, "floor": 4, "navigation_complexity": 3.821, "room": 19, "ssa": 1.203}, "qual": 2}, "Crandon": {"id": "Crandon", "name": "model-112", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 298.35, "floor": 3, "navigation_complexity": 3.632, "room": 13, "ssa": 1.609}, "qual": 4}, "Creede": {"id": "Creede", "name": "model-14", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 81.331, "floor": 1, "navigation_complexity": 2.931, "room": 6, "ssa": 1.207}, "qual": 3}, "Crookston": {"id": "Crookston", "name": "model-517", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1135.121, "floor": 3, "navigation_complexity": 5.306, "room": 17, "ssa": 1.33}, "qual": 2}, "Crugers": {"id": "Crugers", "name": "model-453", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 2019.115, "floor": 4, "navigation_complexity": 2.385, "room": 36, "ssa": 1.297}, "qual": 2}, "Culbertson": {"id": "Culbertson", "name": "model-408", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 6.34, "floor": 1, "navigation_complexity": 1.0, "room": 1, "ssa": 1.342}, "qual": 3}, 
"Cullison": {"id": "Cullison", "name": "model-320", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 590.424, "floor": 4, "navigation_complexity": 1.003, "room": 26, "ssa": 1.593}, "qual": 3}, "Cutlerville": {"id": "Cutlerville", "name": "model-104", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 776.972, "floor": 3, "navigation_complexity": 2.934, "room": 21, "ssa": 1.179}, "qual": 3}, "Dalcour": {"id": "Dalcour", "name": "model-560", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 177.415, "floor": 1, "navigation_complexity": 3.542, "room": 11, "ssa": 0.678}, "qual": 3}, "Dansville": {"id": "Dansville", "name": "model-241", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 457.536, "floor": 4, "navigation_complexity": 2.762, "room": 21, "ssa": 1.094}, "qual": 3}, "Darden": {"id": "Darden", "name": "model-556", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "val", "stats": {"area": 444.113, "floor": 4, "navigation_complexity": 7.162, "room": 27, "ssa": 1.515}, "qual": 3}, "Darnestown": {"id": "Darnestown", "name": "model-169", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 444.709, "floor": 1, "navigation_complexity": 5.051, "room": 3, "ssa": 1.665}, "qual": 2}, "Darrtown": {"id": "Darrtown", "name": "model-41", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 335.329, "floor": 4, "navigation_complexity": 1.84, "room": 12, "ssa": 1.975}, "qual": 2}, "Dauberville": {"id": "Dauberville", "name": "model-119", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 265.929, "floor": 3, "navigation_complexity": 2.347, "room": 9, "ssa": 1.431}, "qual": 4}, "Deatsville": {"id": "Deatsville", "name": "model-311", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 932.826, "floor": 4, "navigation_complexity": 5.574, "room": 12, "ssa": 1.625}, "qual": 3}, "Dedham": {"id": "Dedham", "name": "model-155", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 593.908, "floor": 3, "navigation_complexity": 4.32, "room": 27, "ssa": 1.661}, "qual": 2}, "Deemston": {"id": "Deemston", "name": "model-376", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 366.947, "floor": 3, "navigation_complexity": 1.879, "room": 15, "ssa": 1.8}, "qual": 2}, "Delton": {"id": "Delton", "name": "model-352", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 86.226, "floor": 1, "navigation_complexity": 4.598, "room": 1, "ssa": 2.031}, "qual": 4}, "Denmark": {"id": "Denmark", "name": "model-369", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 40.796, "floor": 1, "navigation_complexity": 1.81, "room": 2, "ssa": 1.637}, "qual": 4}, "Destin": {"id": "Destin", "name": "model-160", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 95.079, "floor": 1, "navigation_complexity": 3.581, "room": 4, "ssa": 1.113}, "qual": 3}, "Divide": {"id": 
"Divide", "name": "model-132", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 70.734, "floor": 1, "navigation_complexity": 4.798, "room": 4, "ssa": 1.174}, "qual": 3}, "Donaldson": {"id": "Donaldson", "name": "model-207", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 1178.619, "floor": 4, "navigation_complexity": 5.979, "room": 26, "ssa": 1.069}, "qual": 1}, "Dryville": {"id": "Dryville", "name": "model-481", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 87.753, "floor": 1, "navigation_complexity": 2.528, "room": 8, "ssa": 1.256}, "qual": 4}, "Duarte": {"id": "Duarte", "name": "model-469", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 557.685, "floor": 4, "navigation_complexity": 7.211, "room": 22, "ssa": 1.238}, "qual": 3}, "Duluth": {"id": "Duluth", "name": "model-244", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1254.255, "floor": 4, "navigation_complexity": 2.656, "room": 39, "ssa": 1.105}, "qual": 2}, "Dunmor": {"id": "Dunmor", "name": "model-554", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 90.056, "floor": 1, "navigation_complexity": 4.075, "room": 7, "ssa": 1.074}, "qual": 4}, "Eagan": {"id": "Eagan", "name": "model-15", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 326.346, "floor": 4, "navigation_complexity": 1.224, "room": 17, "ssa": 1.505}, "qual": 3}, "Eagerville": {"id": "Eagerville", "name": "model-565", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 48.031, "floor": 1, "navigation_complexity": 1.006, "room": 4, "ssa": 1.946}, "qual": 4}, "Eastville": {"id": "Eastville", "name": "model-538", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 121.44, "floor": 1, "navigation_complexity": 3.518, "room": 6, "ssa": 1.347}, "qual": 4}, "Edgemere": {"id": "Edgemere", "name": "model-177", "split_full": "none", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 23.62, "floor": 2, "navigation_complexity": 1.262, "room": 3, "ssa": 0.916}, "qual": 4}, "Edson": {"id": "Edson", "name": "model-214", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 484.365, "floor": 3, "navigation_complexity": 2.561, "room": 12, "ssa": 1.433}, "qual": 3}, "Ellaville": {"id": "Ellaville", "name": "model-38", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 396.693, "floor": 3, "navigation_complexity": 1.347, "room": 16, "ssa": 0.936}, "qual": 3}, "Elmira": {"id": "Elmira", "name": "model-345", "split_full": "none", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 42.795, "floor": 1, "navigation_complexity": 1.431, "room": 3, "ssa": 2.177}, "qual": 4}, "Elton": {"id": "Elton", "name": "model-256", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 303.378, "floor": 1, "navigation_complexity": 3.898, "room": 2, "ssa": 2.173}, "qual": 2}, "Emmaus": {"id": "Emmaus", "name": "model-425", 
"split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 871.08, "floor": 3, "navigation_complexity": 6.55, "room": 25, "ssa": 1.189}, "qual": 3}, "Espanola": {"id": "Espanola", "name": "model-6", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 62.127, "floor": 1, "navigation_complexity": 2.144, "room": 3, "ssa": 0.843}, "qual": 4}, "Eudora": {"id": "Eudora", "name": "model-128", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 37.079, "floor": 1, "navigation_complexity": 2.897, "room": 3, "ssa": 1.161}, "qual": 4}, "Euharlee": {"id": "Euharlee", "name": "model-31", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 67.995, "floor": 1, "navigation_complexity": 4.028, "room": 1, "ssa": 2.07}, "qual": 3}, "Everton": {"id": "Everton", "name": "model-126", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 266.079, "floor": 3, "navigation_complexity": 2.129, "room": 12, "ssa": 1.182}, "qual": 3}, "Ewansville": {"id": "Ewansville", "name": "model-232", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 335.837, "floor": 3, "navigation_complexity": 2.012, "room": 19, "ssa": 1.043}, "qual": 4}, "Ewell": {"id": "Ewell", "name": "model-472", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 767.007, "floor": 4, "navigation_complexity": 2.169, "room": 29, "ssa": 1.149}, "qual": 3}, "Experiment": {"id": "Experiment", "name": "model-420", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 258.674, "floor": 3, "navigation_complexity": 1.024, "room": 11, "ssa": 1.208}, "qual": 3}, "Fedora": {"id": "Fedora", "name": "model-267", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 909.584, "floor": 5, "navigation_complexity": 2.735, "room": 23, "ssa": 1.44}, "qual": 2}, "Fishersville": {"id": "Fishersville", "name": "model-568", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 477.047, "floor": 3, "navigation_complexity": 2.513, "room": 16, "ssa": 1.851}, "qual": 1}, "Fitchburg": {"id": "Fitchburg", "name": "model-527", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 319.542, "floor": 3, "navigation_complexity": 1.017, "room": 14, "ssa": 1.601}, "qual": 1}, "Fleming": {"id": "Fleming", "name": "model-191", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 258.626, "floor": 4, "navigation_complexity": 1.041, "room": 15, "ssa": 1.195}, "qual": 2}, "Fonda": {"id": "Fonda", "name": "model-215", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 560.051, "floor": 4, "navigation_complexity": 2.058, "room": 18, "ssa": 1.992}, "qual": 1}, "Forkland": {"id": "Forkland", "name": "model-499", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 300.418, "floor": 4, "navigation_complexity": 3.539, "room": 27, "ssa": 1.256}, "qual": 2}, "Foyil": {"id": "Foyil", "name": "model-13", "split_full": 
"train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 90.196, "floor": 2, "navigation_complexity": 2.181, "room": 7, "ssa": 1.207}, "qual": 1}, "Frankfort": {"id": "Frankfort", "name": "model-107", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 345.828, "floor": 3, "navigation_complexity": 5.354, "room": 10, "ssa": 1.647}, "qual": 1}, "Frankton": {"id": "Frankton", "name": "model-333", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1018.387, "floor": 1, "navigation_complexity": 2.086, "room": 30, "ssa": 1.432}, "qual": 0}, "Fredericksburg": {"id": "Fredericksburg", "name": "model-401", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 149.214, "floor": 1, "navigation_complexity": 1.511, "room": 2, "ssa": 0.872}, "qual": 1}, "Freedom": {"id": "Freedom", "name": "model-539", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 642.418, "floor": 3, "navigation_complexity": 2.148, "room": 19, "ssa": 1.641}, "qual": 2}, "Frierson": {"id": "Frierson", "name": "model-387", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 288.293, "floor": 3, "navigation_complexity": 1.398, "room": 16, "ssa": 1.274}, "qual": 3}, "Frontenac": {"id": "Frontenac", "name": "model-461", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 506.124, "floor": 4, "navigation_complexity": 5.005, "room": 21, "ssa": 1.405}, "qual": 1}, "Funkstown": {"id": "Funkstown", "name": "model-186", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 629.038, "floor": 3, "navigation_complexity": 2.827, "room": 15, "ssa": 1.442}, "qual": 2}, "Galatia": {"id": "Galatia", "name": "model-359", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 76.024, "floor": 1, "navigation_complexity": 2.863, "room": 5, "ssa": 1.212}, "qual": 3}, "Gasburg": {"id": "Gasburg", "name": "model-252", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 62.01, "floor": 1, "navigation_complexity": 1.708, "room": 3, "ssa": 1.494}, "qual": 3}, "Gastonia": {"id": "Gastonia", "name": "model-385", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 330.179, "floor": 3, "navigation_complexity": 2.527, "room": 15, "ssa": 1.787}, "qual": 2}, "Gaylord": {"id": "Gaylord", "name": "model-150", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 59.25, "floor": 1, "navigation_complexity": 3.38, "room": 5, "ssa": 1.339}, "qual": 3}, "German": {"id": "German", "name": "model-466", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 1050.351, "floor": 3, "navigation_complexity": 9.619, "room": 7, "ssa": 2.115}, "qual": 1}, "Germfask": {"id": "Germfask", "name": "model-520", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 43.875, "floor": 1, "navigation_complexity": 3.079, "room": 5, "ssa": 1.536}, "qual": 3}, "Gilbert": {"id": "Gilbert", "name": "model-110", "split_full": 
"train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 221.741, "floor": 3, "navigation_complexity": 1.567, "room": 2, "ssa": 1.238}, "qual": 2}, "Gladstone": {"id": "Gladstone", "name": "model-167", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 337.942, "floor": 4, "navigation_complexity": 1.371, "room": 18, "ssa": 1.179}, "qual": 3}, "Glassboro": {"id": "Glassboro", "name": "model-438", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1089.73, "floor": 2, "navigation_complexity": 2.243, "room": 21, "ssa": 1.46}, "qual": 2}, "Glenmoor": {"id": "Glenmoor", "name": "model-213", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 448.802, "floor": 3, "navigation_complexity": 6.722, "room": 17, "ssa": 1.433}, "qual": 2}, "Globe": {"id": "Globe", "name": "model-381", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 359.482, "floor": 4, "navigation_complexity": 5.787, "room": 20, "ssa": 1.267}, "qual": 2}, "Gloria": {"id": "Gloria", "name": "model-182", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 348.101, "floor": 3, "navigation_complexity": 5.28, "room": 7, "ssa": 1.56}, "qual": 2}, "Gluck": {"id": "Gluck", "name": "model-497", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 74.119, "floor": 1, "navigation_complexity": 1.204, "room": 7, "ssa": 1.524}, "qual": 2}, "Gluek": {"id": "Gluek", "name": "model-290", "split_full": "none", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 740.584, "floor": 7, "navigation_complexity": 3.62, "room": 12, "ssa": 0.493}, "qual": 1}, "Goffs": {"id": "Goffs", "name": "model-374", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 482.09, "floor": 3, "navigation_complexity": 5.257, "room": 17, "ssa": 2.062}, "qual": 4}, "Goodfield": {"id": "Goodfield", "name": "model-5", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 348.422, "floor": 4, "navigation_complexity": 3.365, "room": 16, "ssa": 0.843}, "qual": 3}, "Goodview": {"id": "Goodview", "name": "model-284", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 104.041, "floor": 1, "navigation_complexity": 2.423, "room": 7, "ssa": 1.403}, "qual": 3}, "Goodwine": {"id": "Goodwine", "name": "model-309", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 285.005, "floor": 3, "navigation_complexity": 3.3, "room": 16, "ssa": 1.083}, "qual": 3}, "Goodyear": {"id": "Goodyear", "name": "model-114", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 367.827, "floor": 4, "navigation_complexity": 1.041, "room": 16, "ssa": 1.969}, "qual": 2}, "Gough": {"id": "Gough", "name": "model-99", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 918.933, "floor": 5, "navigation_complexity": 3.98, "room": 23, "ssa": 1.027}, "qual": 1}, "Grace": {"id": "Grace", "name": "model-59", "split_full": "none", "split_full+": "train", 
"split_medium": "none", "split_tiny": "none", "stats": {"area": 1303.692, "floor": 3, "navigation_complexity": 4.095, "room": 21, "ssa": 1.165}, "qual": 0}, "Graceville": {"id": "Graceville", "name": "model-521", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 359.458, "floor": 3, "navigation_complexity": 1.676, "room": 15, "ssa": 1.96}, "qual": 3}, "Gracey": {"id": "Gracey", "name": "model-375", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1958.748, "floor": 1, "navigation_complexity": 8.073, "room": 52, "ssa": 1.8}, "qual": 1}, "Grainola": {"id": "Grainola", "name": "model-149", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 2363.378, "floor": 3, "navigation_complexity": 23.015, "room": 39, "ssa": 1.339}, "qual": 1}, "Grangeville": {"id": "Grangeville", "name": "model-510", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 376.558, "floor": 4, "navigation_complexity": 1.31, "room": 20, "ssa": 1.223}, "qual": 3}, "Grantsville": {"id": "Grantsville", "name": "model-468", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 547.068, "floor": 4, "navigation_complexity": 1.001, "room": 20, "ssa": 1.282}, "qual": 3}, "Grassy": {"id": "Grassy", "name": "model-89", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1626.834, "floor": 3, "navigation_complexity": 2.643, "room": 29, "ssa": 0.731}, "qual": 1}, "Gratz": {"id": "Gratz", "name": "model-512", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 861.946, "floor": 1, "navigation_complexity": 1.047, "room": 2, "ssa": 0.873}, "qual": 1}, "Gravelly": {"id": "Gravelly", "name": "model-219", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 530.688, "floor": 3, "navigation_complexity": 5.877, "room": 27, "ssa": 1.156}, "qual": 3}, "Greigsville": {"id": "Greigsville", "name": "model-308", "split_full": "none", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 43.603, "floor": 1, "navigation_complexity": 1.847, "room": 2, "ssa": 1.406}, "qual": 4}, "Grigston": {"id": "Grigston", "name": "model-360", "split_full": "none", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 67.84, "floor": 1, "navigation_complexity": 1.169, "room": 2, "ssa": 1.212}, "qual": 2}, "Haaswood": {"id": "Haaswood", "name": "model-30", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 537.735, "floor": 5, "navigation_complexity": 2.232, "room": 19, "ssa": 1.673}, "qual": 1}, "Hacienda": {"id": "Hacienda", "name": "model-35", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 437.136, "floor": 3, "navigation_complexity": 3.863, "room": 16, "ssa": 1.46}, "qual": 2}, "Hainesburg": {"id": "Hainesburg", "name": "model-511", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 275.235, "floor": 3, "navigation_complexity": 3.244, "room": 15, "ssa": 0.873}, "qual": 4}, "Halfway": {"id": "Halfway", "name": "model-61", "split_full": "train", "split_full+": "train", 
"split_medium": "none", "split_tiny": "none", "stats": {"area": 185.459, "floor": 3, "navigation_complexity": 1.118, "room": 10, "ssa": 1.643}, "qual": 3}, "Hallettsville": {"id": "Hallettsville", "name": "model-399", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 670.759, "floor": 3, "navigation_complexity": 1.531, "room": 18, "ssa": 1.341}, "qual": 3}, "Hambleton": {"id": "Hambleton", "name": "model-272", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 67.901, "floor": 1, "navigation_complexity": 1.0, "room": 5, "ssa": 1.363}, "qual": 4}, "Hammon": {"id": "Hammon", "name": "model-81", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 608.869, "floor": 3, "navigation_complexity": 4.273, "room": 19, "ssa": 1.617}, "qual": 2}, "Hanson": {"id": "Hanson", "name": "model-1", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 343.461, "floor": 4, "navigation_complexity": 9.4, "room": 21, "ssa": 1.552}, "qual": 2}, "Harkeyville": {"id": "Harkeyville", "name": "model-564", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1374.661, "floor": 3, "navigation_complexity": 3.107, "room": 2, "ssa": 1.607}, "qual": 2}, "Harrellsville": {"id": "Harrellsville", "name": "model-164", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 61.215, "floor": 1, "navigation_complexity": 3.886, "room": 5, "ssa": 1.553}, "qual": 3}, "Hartline": {"id": "Hartline", "name": "model-201", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 727.746, "floor": 3, "navigation_complexity": 3.41, "room": 21, "ssa": 1.275}, "qual": 1}, "Hatfield": {"id": "Hatfield", "name": "model-153", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 73.469, "floor": 1, "navigation_complexity": 3.17, "room": 5, "ssa": 1.008}, "qual": 3}, "Haxtun": {"id": "Haxtun", "name": "model-23", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 57.408, "floor": 1, "navigation_complexity": 2.184, "room": 5, "ssa": 2.077}, "qual": 4}, "Haymarket": {"id": "Haymarket", "name": "model-516", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 456.154, "floor": 3, "navigation_complexity": 5.456, "room": 21, "ssa": 1.635}, "qual": 2}, "Helix": {"id": "Helix", "name": "model-140", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 61.886, "floor": 1, "navigation_complexity": 2.504, "room": 4, "ssa": 1.152}, "qual": 2}, "Helton": {"id": "Helton", "name": "model-349", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 310.883, "floor": 3, "navigation_complexity": 5.192, "room": 11, "ssa": 0.81}, "qual": 3}, "Hendrix": {"id": "Hendrix", "name": "model-66", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 659.372, "floor": 3, "navigation_complexity": 4.787, "room": 17, "ssa": 1.7}, "qual": 3}, "Hennepin": {"id": "Hennepin", "name": "model-275", "split_full": "val", "split_full+": "val", 
"split_medium": "none", "split_tiny": "none", "stats": {"area": 1955.06, "floor": 3, "navigation_complexity": 2.058, "room": 3, "ssa": 0.939}, "qual": 2}, "Hercules": {"id": "Hercules", "name": "model-537", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 119.162, "floor": 1, "navigation_complexity": 3.477, "room": 8, "ssa": 1.347}, "qual": 2}, "Herricks": {"id": "Herricks", "name": "model-141", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 273.683, "floor": 1, "navigation_complexity": 1.117, "room": 17, "ssa": 1.87}, "qual": 2}, "Highspire": {"id": "Highspire", "name": "model-428", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 214.799, "floor": 4, "navigation_complexity": 1.715, "room": 13, "ssa": 1.645}, "qual": 3}, "Hildebran": {"id": "Hildebran", "name": "model-326", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 96.343, "floor": 1, "navigation_complexity": 4.173, "room": 6, "ssa": 1.9}, "qual": 3}, "Hillsdale": {"id": "Hillsdale", "name": "model-178", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 146.903, "floor": 3, "navigation_complexity": 3.099, "room": 2, "ssa": 0.916}, "qual": 4}, "Hindsboro": {"id": "Hindsboro", "name": "model-281", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1581.764, "floor": 4, "navigation_complexity": 6.659, "room": 3, "ssa": 1.21}, "qual": 2}, "Hitchland": {"id": "Hitchland", "name": "model-106", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 636.256, "floor": 4, "navigation_complexity": 5.794, "room": 24, "ssa": 2.064}, "qual": 2}, "Hiteman": {"id": "Hiteman", "name": "model-358", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 282.095, "floor": 4, "navigation_complexity": 6.118, "room": 19, "ssa": 1.678}, "qual": 2}, "Hobson": {"id": "Hobson", "name": "model-208", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 125.348, "floor": 1, "navigation_complexity": 3.187, "room": 9, "ssa": 1.069}, "qual": 2}, "Holcut": {"id": "Holcut", "name": "model-488", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1064.191, "floor": 2, "navigation_complexity": 3.567, "room": 8, "ssa": 0.823}, "qual": 2}, "Hometown": {"id": "Hometown", "name": "model-530", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 61.282, "floor": 1, "navigation_complexity": 2.189, "room": 4, "ssa": 0.938}, "qual": 4}, "Hominy": {"id": "Hominy", "name": "model-105", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 143.953, "floor": 3, "navigation_complexity": 1.518, "room": 10, "ssa": 2.064}, "qual": 4}, "Hordville": {"id": "Hordville", "name": "model-273", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 205.509, "floor": 4, "navigation_complexity": 1.003, "room": 16, "ssa": 1.247}, "qual": 2}, "Hornsby": {"id": "Hornsby", "name": "model-279", "split_full": "val", "split_full+": "val", 
"split_medium": "none", "split_tiny": "none", "stats": {"area": 307.377, "floor": 3, "navigation_complexity": 3.045, "room": 18, "ssa": 1.946}, "qual": 2}, "Hortense": {"id": "Hortense", "name": "model-67", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 172.84, "floor": 4, "navigation_complexity": 1.0, "room": 15, "ssa": 2.049}, "qual": 3}, "Howie": {"id": "Howie", "name": "model-109", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 570.3, "floor": 3, "navigation_complexity": 3.227, "room": 22, "ssa": 1.238}, "qual": 3}, "Hurley": {"id": "Hurley", "name": "model-450", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 844.897, "floor": 4, "navigation_complexity": 3.365, "room": 23, "ssa": 1.353}, "qual": 2}, "Idanha": {"id": "Idanha", "name": "model-71", "split_full": "none", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 40.383, "floor": 1, "navigation_complexity": 1.407, "room": 1, "ssa": 2.525}, "qual": 2}, "Ihlen": {"id": "Ihlen", "name": "model-32", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "test", "stats": {"area": 359.128, "floor": 3, "navigation_complexity": 2.194, "room": 17, "ssa": 2.07}, "qual": 2}, "Imbery": {"id": "Imbery", "name": "model-113", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 425.898, "floor": 3, "navigation_complexity": 1.899, "room": 12, "ssa": 1.969}, "qual": 2}, "Inkom": {"id": "Inkom", "name": "model-261", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 69.349, "floor": 1, "navigation_complexity": 1.835, "room": 6, "ssa": 1.438}, "qual": 3}, "Irvine": {"id": "Irvine", "name": "model-291", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 544.508, "floor": 4, "navigation_complexity": 1.473, "room": 15, "ssa": 1.271}, "qual": 3}, "Islandton": {"id": "Islandton", "name": "model-572", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 280.636, "floor": 1, "navigation_complexity": 5.436, "room": 9, "ssa": 1.402}, "qual": 2}, "Jacobus": {"id": "Jacobus", "name": "model-327", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 550.278, "floor": 6, "navigation_complexity": 3.357, "room": 8, "ssa": 1.886}, "qual": 2}, "Jenners": {"id": "Jenners", "name": "model-437", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 590.746, "floor": 3, "navigation_complexity": 4.269, "room": 20, "ssa": 1.46}, "qual": 2}, "Jennie": {"id": "Jennie", "name": "model-287", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 426.996, "floor": 4, "navigation_complexity": 1.0, "room": 29, "ssa": 1.781}, "qual": 2}, "Judith": {"id": "Judith", "name": "model-344", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1397.48, "floor": 3, "navigation_complexity": 2.011, "room": 25, "ssa": 1.647}, "qual": 1}, "Kangley": {"id": "Kangley", "name": "model-223", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 
557.674, "floor": 3, "navigation_complexity": 1.695, "room": 16, "ssa": 1.356}, "qual": 2}, "Kankakee": {"id": "Kankakee", "name": "model-432", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 434.315, "floor": 4, "navigation_complexity": 2.273, "room": 22, "ssa": 1.743}, "qual": 3}, "Kathryn": {"id": "Kathryn", "name": "model-147", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 649.14, "floor": 3, "navigation_complexity": 3.569, "room": 15, "ssa": 1.837}, "qual": 1}, "Keiser": {"id": "Keiser", "name": "model-346", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 396.175, "floor": 4, "navigation_complexity": 1.745, "room": 22, "ssa": 2.177}, "qual": 3}, "Kemblesville": {"id": "Kemblesville", "name": "model-176", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 67.226, "floor": 1, "navigation_complexity": 4.037, "room": 6, "ssa": 1.091}, "qual": 2}, "Kendall": {"id": "Kendall", "name": "model-254", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 303.213, "floor": 1, "navigation_complexity": 2.31, "room": 12, "ssa": 1.568}, "qual": 3}, "Kerrtown": {"id": "Kerrtown", "name": "model-217", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 124.735, "floor": 1, "navigation_complexity": 2.876, "room": 7, "ssa": 1.303}, "qual": 4}, "Kettle": {"id": "Kettle", "name": "model-382", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 90.146, "floor": 3, "navigation_complexity": 1.005, "room": 10, "ssa": 1.267}, "qual": 2}, "Kevin": {"id": "Kevin", "name": "model-234", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 505.398, "floor": 3, "navigation_complexity": 1.71, "room": 21, "ssa": 1.398}, "qual": 1}, "Keweenaw": {"id": "Keweenaw", "name": "model-507", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 314.952, "floor": 1, "navigation_complexity": 1.352, "room": 24, "ssa": 1.126}, "qual": 2}, "Kihei": {"id": "Kihei", "name": "model-278", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 323.7, "floor": 3, "navigation_complexity": 1.519, "room": 10, "ssa": 1.664}, "qual": 2}, "Kildare": {"id": "Kildare", "name": "model-325", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 143.509, "floor": 3, "navigation_complexity": 1.062, "room": 11, "ssa": 1.9}, "qual": 2}, "Kinde": {"id": "Kinde", "name": "model-158", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 402.563, "floor": 3, "navigation_complexity": 2.058, "room": 19, "ssa": 1.38}, "qual": 2}, "Kingdom": {"id": "Kingdom", "name": "model-493", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 424.793, "floor": 1, "navigation_complexity": 4.381, "room": 15, "ssa": 1.941}, "qual": 1}, "Kingfisher": {"id": "Kingfisher", "name": "model-86", "split_full": "none", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 730.717, "floor": 1, 
"navigation_complexity": 2.737, "room": 2, "ssa": 1.287}, "qual": 1}, "Kinney": {"id": "Kinney", "name": "model-94", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 132.303, "floor": 1, "navigation_complexity": 1.9, "room": 7, "ssa": 1.602}, "qual": 2}, "Kirksville": {"id": "Kirksville", "name": "model-329", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 926.364, "floor": 6, "navigation_complexity": 3.98, "room": 21, "ssa": 1.156}, "qual": 0}, "Kirwin": {"id": "Kirwin", "name": "model-209", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 851.942, "floor": 3, "navigation_complexity": 5.134, "room": 22, "ssa": 1.868}, "qual": 2}, "Klickitat": {"id": "Klickitat", "name": "model-19", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 1618.439, "floor": 6, "navigation_complexity": 11.77, "room": 20, "ssa": 1.425}, "qual": 2}, "Kobuk": {"id": "Kobuk", "name": "model-179", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 349.003, "floor": 3, "navigation_complexity": 2.498, "room": 15, "ssa": 1.437}, "qual": 2}, "Kopperl": {"id": "Kopperl", "name": "model-495", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 315.216, "floor": 4, "navigation_complexity": 2.315, "room": 17, "ssa": 1.522}, "qual": 2}, "Kremlin": {"id": "Kremlin", "name": "model-318", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 53.553, "floor": 1, "navigation_complexity": 2.029, "room": 4, "ssa": 1.309}, "qual": 3}, "Kronborg": {"id": "Kronborg", "name": "model-189", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 385.662, "floor": 1, "navigation_complexity": 3.23, "room": 4, "ssa": 1.091}, "qual": 2}, "Lacon": {"id": "Lacon", "name": "model-138", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 3121.66, "floor": 2, "navigation_complexity": 6.679, "room": 42, "ssa": 1.827}, "qual": 0}, "Ladue": {"id": "Ladue", "name": "model-549", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 200.588, "floor": 3, "navigation_complexity": 2.303, "room": 9, "ssa": 0.698}, "qual": 4}, "Lajas": {"id": "Lajas", "name": "model-412", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 184.768, "floor": 3, "navigation_complexity": 1.118, "room": 10, "ssa": 0.916}, "qual": 3}, "Lakeville": {"id": "Lakeville", "name": "model-354", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 876.26, "floor": 3, "navigation_complexity": 13.218, "room": 19, "ssa": 2.052}, "qual": 1}, "Landing": {"id": "Landing", "name": "model-122", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 865.108, "floor": 4, "navigation_complexity": 3.033, "room": 16, "ssa": 1.62}, "qual": 3}, "Lathrup": {"id": "Lathrup", "name": "model-496", "split_full": "none", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 44.215, "floor": 1, "navigation_complexity": 1.648, "room": 
3, "ssa": 1.522}, "qual": 4}, "Laupahoehoe": {"id": "Laupahoehoe", "name": "model-465", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 926.358, "floor": 5, "navigation_complexity": 2.169, "room": 28, "ssa": 2.115}, "qual": 1}, "Laytonsville": {"id": "Laytonsville", "name": "model-53", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 58.748, "floor": 1, "navigation_complexity": 1.024, "room": 3, "ssa": 1.416}, "qual": 3}, "Leavittsburg": {"id": "Leavittsburg", "name": "model-134", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 394.078, "floor": 3, "navigation_complexity": 2.964, "room": 18, "ssa": 1.956}, "qual": 2}, "Leilani": {"id": "Leilani", "name": "model-84", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 977.135, "floor": 1, "navigation_complexity": 3.157, "room": 42, "ssa": 1.256}, "qual": 0}, "Lenoir": {"id": "Lenoir", "name": "model-312", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 193.152, "floor": 2, "navigation_complexity": 2.873, "room": 8, "ssa": 1.625}, "qual": 3}, "Leonardo": {"id": "Leonardo", "name": "model-229", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 610.147, "floor": 4, "navigation_complexity": 8.586, "room": 20, "ssa": 1.637}, "qual": 1}, "Lessley": {"id": "Lessley", "name": "model-197", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 734.48, "floor": 4, "navigation_complexity": 4.327, "room": 22, "ssa": 1.455}, "qual": 2}, "Liddieville": {"id": "Liddieville", "name": "model-378", "split_full": "none", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 336.058, "floor": 1, "navigation_complexity": 1.89, "room": 14, "ssa": 1.175}, "qual": 2}, "Lincolnwood": {"id": "Lincolnwood", "name": "model-509", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 6298.848, "floor": 4, "navigation_complexity": 17.889, "room": 110, "ssa": 1.223}, "qual": 0}, "Lindberg": {"id": "Lindberg", "name": "model-362", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 463.687, "floor": 3, "navigation_complexity": 2.159, "room": 17, "ssa": 1.065}, "qual": 0}, "Lindenwood": {"id": "Lindenwood", "name": "model-476", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 1305.287, "floor": 3, "navigation_complexity": 11.816, "room": 37, "ssa": 1.297}, "qual": 3}, "Lindsborg": {"id": "Lindsborg", "name": "model-379", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 333.7, "floor": 3, "navigation_complexity": 5.038, "room": 19, "ssa": 1.375}, "qual": 2}, "Lineville": {"id": "Lineville", "name": "model-501", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 359.806, "floor": 3, "navigation_complexity": 2.438, "room": 15, "ssa": 1.422}, "qual": 3}, "Lluveras": {"id": "Lluveras", "name": "model-418", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 339.575, "floor": 3, 
"navigation_complexity": 1.944, "room": 15, "ssa": 1.346}, "qual": 3}, "Losantville": {"id": "Losantville", "name": "model-504", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 1111.809, "floor": 4, "navigation_complexity": 3.684, "room": 22, "ssa": 1.616}, "qual": 2}, "Lovilia": {"id": "Lovilia", "name": "model-282", "split_full": "none", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 122.258, "floor": 3, "navigation_complexity": 1.313, "room": 7, "ssa": 1.21}, "qual": 4}, "Lucan": {"id": "Lucan", "name": "model-277", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 701.684, "floor": 4, "navigation_complexity": 2.533, "room": 28, "ssa": 1.664}, "qual": 2}, "Ludlowville": {"id": "Ludlowville", "name": "model-240", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 587.497, "floor": 3, "navigation_complexity": 1.494, "room": 17, "ssa": 1.575}, "qual": 2}, "Lynchburg": {"id": "Lynchburg", "name": "model-60", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 284.04, "floor": 3, "navigation_complexity": 5.952, "room": 10, "ssa": 1.165}, "qual": 3}, "Lynxville": {"id": "Lynxville", "name": "model-506", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 79.531, "floor": 1, "navigation_complexity": 1.671, "room": 3, "ssa": 2.016}, "qual": 3}, "Maben": {"id": "Maben", "name": "model-173", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 37.044, "floor": 1, "navigation_complexity": 1.539, "room": 4, "ssa": 1.342}, "qual": 5}, "MacArthur": {"id": "MacArthur", "name": "model-429", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 641.795, "floor": 4, "navigation_complexity": 2.202, "room": 21, "ssa": 1.228}, "qual": 2}, "Macedon": {"id": "Macedon", "name": "model-561", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 571.195, "floor": 3, "navigation_complexity": 1.384, "room": 31, "ssa": 1.02}, "qual": 2}, "Macksville": {"id": "Macksville", "name": "model-480", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 339.319, "floor": 3, "navigation_complexity": 2.154, "room": 15, "ssa": 1.128}, "qual": 2}, "Macland": {"id": "Macland", "name": "model-338", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 247.187, "floor": 3, "navigation_complexity": 5.915, "room": 10, "ssa": 0.742}, "qual": 3}, "Maguayo": {"id": "Maguayo", "name": "model-79", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 550.098, "floor": 1, "navigation_complexity": 18.434, "room": 31, "ssa": 1.356}, "qual": 1}, "Mahtomedi": {"id": "Mahtomedi", "name": "model-361", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 49.658, "floor": 1, "navigation_complexity": 1.742, "room": 2, "ssa": 1.065}, "qual": 3}, "Maida": {"id": "Maida", "name": "model-242", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 30.097, "floor": 1, 
"navigation_complexity": 2.481, "room": 3, "ssa": 1.094}, "qual": 3}, "Maiden": {"id": "Maiden", "name": "model-129", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 194.383, "floor": 4, "navigation_complexity": 1.977, "room": 13, "ssa": 1.208}, "qual": 2}, "Maitland": {"id": "Maitland", "name": "model-386", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 168.856, "floor": 1, "navigation_complexity": 3.109, "room": 8, "ssa": 1.787}, "qual": 2}, "Mammoth": {"id": "Mammoth", "name": "model-96", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 469.434, "floor": 4, "navigation_complexity": 3.574, "room": 21, "ssa": 1.778}, "qual": 3}, "Manassas": {"id": "Manassas", "name": "model-49", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 510.112, "floor": 3, "navigation_complexity": 2.571, "room": 22, "ssa": 1.151}, "qual": 2}, "Maricopa": {"id": "Maricopa", "name": "model-73", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 243.042, "floor": 3, "navigation_complexity": 1.05, "room": 11, "ssa": 1.863}, "qual": 3}, "Markleeville": {"id": "Markleeville", "name": "model-534", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "val", "stats": {"area": 354.146, "floor": 4, "navigation_complexity": 5.198, "room": 14, "ssa": 0.955}, "qual": 2}, "Marksville": {"id": "Marksville", "name": "model-157", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 585.206, "floor": 1, "navigation_complexity": 3.922, "room": 16, "ssa": 1.38}, "qual": 2}, "Marland": {"id": "Marland", "name": "model-357", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 547.906, "floor": 5, "navigation_complexity": 1.295, "room": 16, "ssa": 1.678}, "qual": 2}, "Marstons": {"id": "Marstons", "name": "model-257", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 1481.287, "floor": 7, "navigation_complexity": 4.051, "room": 43, "ssa": 1.574}, "qual": 2}, "Martinville": {"id": "Martinville", "name": "model-28", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 258.383, "floor": 3, "navigation_complexity": 1.602, "room": 15, "ssa": 1.957}, "qual": 2}, "Maryhill": {"id": "Maryhill", "name": "model-307", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 531.583, "floor": 4, "navigation_complexity": 2.939, "room": 12, "ssa": 1.406}, "qual": 4}, "Mashulaville": {"id": "Mashulaville", "name": "model-180", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 455.396, "floor": 4, "navigation_complexity": 1.597, "room": 31, "ssa": 1.437}, "qual": 3}, "Matoaca": {"id": "Matoaca", "name": "model-22", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 57.048, "floor": 1, "navigation_complexity": 1.881, "room": 5, "ssa": 1.263}, "qual": 4}, "Maugansville": {"id": "Maugansville", "name": "model-513", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 
502.758, "floor": 3, "navigation_complexity": 3.097, "room": 21, "ssa": 1.136}, "qual": 2}, "Maunawili": {"id": "Maunawili", "name": "model-301", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 107.755, "floor": 2, "navigation_complexity": 2.763, "room": 4, "ssa": 1.153}, "qual": 3}, "Mayesville": {"id": "Mayesville", "name": "model-363", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 85.262, "floor": 1, "navigation_complexity": 1.0, "room": 2, "ssa": 1.281}, "qual": 3}, "Mazomanie": {"id": "Mazomanie", "name": "model-536", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 599.358, "floor": 3, "navigation_complexity": 2.916, "room": 25, "ssa": 0.794}, "qual": 2}, "McCloud": {"id": "McCloud", "name": "model-72", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 645.828, "floor": 3, "navigation_complexity": 5.655, "room": 26, "ssa": 2.525}, "qual": 2}, "McClure": {"id": "McClure", "name": "model-218", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 463.599, "floor": 3, "navigation_complexity": 1.021, "room": 15, "ssa": 1.303}, "qual": 2}, "McDade": {"id": "McDade", "name": "model-347", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "test", "stats": {"area": 928.005, "floor": 5, "navigation_complexity": 3.704, "room": 26, "ssa": 1.522}, "qual": 2}, "McEwen": {"id": "McEwen", "name": "model-203", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 284.58, "floor": 1, "navigation_complexity": 6.629, "room": 9, "ssa": 1.821}, "qual": 2}, "McKeesport": {"id": "McKeesport", "name": "model-533", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 592.516, "floor": 3, "navigation_complexity": 1.314, "room": 29, "ssa": 0.955}, "qual": 2}, "McNary": {"id": "McNary", "name": "model-184", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 248.132, "floor": 3, "navigation_complexity": 5.342, "room": 12, "ssa": 1.012}, "qual": 2}, "Melstone": {"id": "Melstone", "name": "model-95", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1049.473, "floor": 3, "navigation_complexity": 6.013, "room": 29, "ssa": 1.778}, "qual": 1}, "Mentasta": {"id": "Mentasta", "name": "model-548", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 586.539, "floor": 3, "navigation_complexity": 3.235, "room": 12, "ssa": 1.297}, "qual": 2}, "Mentmore": {"id": "Mentmore", "name": "model-260", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 458.149, "floor": 3, "navigation_complexity": 1.681, "room": 22, "ssa": 1.251}, "qual": 3}, "Merchantville": {"id": "Merchantville", "name": "model-441", "split_full": "none", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 243.923, "floor": 1, "navigation_complexity": 2.31, "room": 2, "ssa": 2.049}, "qual": 1}, "Merlin": {"id": "Merlin", "name": "model-62", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 
569.168, "floor": 3, "navigation_complexity": 2.186, "room": 16, "ssa": 1.643}, "qual": 3}, "Merom": {"id": "Merom", "name": "model-2", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 269.431, "floor": 3, "navigation_complexity": 7.878, "room": 13, "ssa": 1.552}, "qual": 3}, "Mesic": {"id": "Mesic", "name": "model-194", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 375.196, "floor": 4, "navigation_complexity": 3.506, "room": 19, "ssa": 1.413}, "qual": 4}, "Micanopy": {"id": "Micanopy", "name": "model-172", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 374.441, "floor": 4, "navigation_complexity": 2.006, "room": 29, "ssa": 0.498}, "qual": 4}, "Michiana": {"id": "Michiana", "name": "model-462", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 53.665, "floor": 1, "navigation_complexity": 1.049, "room": 5, "ssa": 1.405}, "qual": 3}, "Mifflinburg": {"id": "Mifflinburg", "name": "model-500", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 230.642, "floor": 4, "navigation_complexity": 1.765, "room": 15, "ssa": 1.256}, "qual": 2}, "Mifflintown": {"id": "Mifflintown", "name": "model-531", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 165.089, "floor": 2, "navigation_complexity": 2.151, "room": 28, "ssa": 1.34}, "qual": 4}, "Milaca": {"id": "Milaca", "name": "model-249", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 375.791, "floor": 4, "navigation_complexity": 8.021, "room": 21, "ssa": 1.241}, "qual": 2}, "Milford": {"id": "Milford", "name": "model-467", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 972.814, "floor": 4, "navigation_complexity": 3.448, "room": 28, "ssa": 1.282}, "qual": 2}, "Millbury": {"id": "Millbury", "name": "model-238", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 383.756, "floor": 3, "navigation_complexity": 3.233, "room": 15, "ssa": 1.609}, "qual": 3}, "Moark": {"id": "Moark", "name": "model-280", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 2287.132, "floor": 5, "navigation_complexity": 8.482, "room": 31, "ssa": 1.946}, "qual": 1}, "Moberly": {"id": "Moberly", "name": "model-103", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 442.52, "floor": 3, "navigation_complexity": 1.0, "room": 16, "ssa": 1.179}, "qual": 3}, "Mobridge": {"id": "Mobridge", "name": "model-395", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 106.396, "floor": 1, "navigation_complexity": 2.111, "room": 6, "ssa": 1.491}, "qual": 4}, "Model": {"id": "Model", "name": "model-266", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 366.231, "floor": 3, "navigation_complexity": 2.112, "room": 14, "ssa": 1.337}, "qual": 3}, "Mogadore": {"id": "Mogadore", "name": "model-407", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 2363.158, 
"floor": 10, "navigation_complexity": 4.48, "room": 30, "ssa": 1.342}, "qual": 3}, "Mogote": {"id": "Mogote", "name": "model-336", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 375.962, "floor": 3, "navigation_complexity": 2.227, "room": 14, "ssa": 1.496}, "qual": 2}, "Monson": {"id": "Monson", "name": "model-135", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 188.722, "floor": 3, "navigation_complexity": 3.146, "room": 8, "ssa": 0.909}, "qual": 4}, "Monticello": {"id": "Monticello", "name": "model-90", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 185.89, "floor": 3, "navigation_complexity": 1.9, "room": 17, "ssa": 0.731}, "qual": 3}, "Montreal": {"id": "Montreal", "name": "model-547", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 63.793, "floor": 1, "navigation_complexity": 3.082, "room": 5, "ssa": 1.297}, "qual": 2}, "Moonachie": {"id": "Moonachie", "name": "model-196", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 2697.304, "floor": 3, "navigation_complexity": 2.112, "room": 29, "ssa": 1.16}, "qual": 1}, "Morris": {"id": "Morris", "name": "model-305", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 154.83, "floor": 3, "navigation_complexity": 4.126, "room": 6, "ssa": 1.988}, "qual": 4}, "Mosinee": {"id": "Mosinee", "name": "model-508", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 55.911, "floor": 1, "navigation_complexity": 2.33, "room": 4, "ssa": 1.126}, "qual": 4}, "Mosquito": {"id": "Mosquito", "name": "model-315", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 419.697, "floor": 3, "navigation_complexity": 4.363, "room": 17, "ssa": 1.241}, "qual": 4}, "Muleshoe": {"id": "Muleshoe", "name": "model-80", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "test", "stats": {"area": 1204.837, "floor": 5, "navigation_complexity": 3.61, "room": 20, "ssa": 1.356}, "qual": 2}, "Mullica": {"id": "Mullica", "name": "model-54", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 282.689, "floor": 3, "navigation_complexity": 2.201, "room": 13, "ssa": 1.416}, "qual": 3}, "Munsons": {"id": "Munsons", "name": "model-10", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 234.973, "floor": 3, "navigation_complexity": 2.481, "room": 11, "ssa": 1.577}, "qual": 2}, "Murchison": {"id": "Murchison", "name": "model-409", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 485.544, "floor": 4, "navigation_complexity": 1.03, "room": 22, "ssa": 0.821}, "qual": 2}, "Musicks": {"id": "Musicks", "name": "model-484", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 209.679, "floor": 3, "navigation_complexity": 1.141, "room": 10, "ssa": 2.332}, "qual": 2}, "Natural": {"id": "Natural", "name": "model-400", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 574.681, "floor": 4, 
"navigation_complexity": 2.113, "room": 24, "ssa": 1.341}, "qual": 2}, "Neibert": {"id": "Neibert", "name": "model-439", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 578.138, "floor": 3, "navigation_complexity": 7.6, "room": 15, "ssa": 1.576}, "qual": 2}, "Nemacolin": {"id": "Nemacolin", "name": "model-449", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 544.518, "floor": 4, "navigation_complexity": 3.78, "room": 26, "ssa": 1.353}, "qual": 4}, "Neshkoro": {"id": "Neshkoro", "name": "model-274", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 789.459, "floor": 3, "navigation_complexity": 4.976, "room": 26, "ssa": 1.247}, "qual": 1}, "Newcomb": {"id": "Newcomb", "name": "model-283", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 83.141, "floor": 2, "navigation_complexity": 3.346, "room": 6, "ssa": 1.403}, "qual": 3}, "Newfields": {"id": "Newfields", "name": "model-288", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 514.149, "floor": 5, "navigation_complexity": 6.736, "room": 19, "ssa": 1.781}, "qual": 3}, "Nicut": {"id": "Nicut", "name": "model-367", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 90.831, "floor": 1, "navigation_complexity": 3.008, "room": 7, "ssa": 1.332}, "qual": 5}, "Nimmons": {"id": "Nimmons", "name": "model-68", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 792.206, "floor": 3, "navigation_complexity": 6.09, "room": 19, "ssa": 2.049}, "qual": 4}, "Noonday": {"id": "Noonday", "name": "model-224", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 218.453, "floor": 4, "navigation_complexity": 1.0, "room": 16, "ssa": 1.356}, "qual": 3}, "Northgate": {"id": "Northgate", "name": "model-558", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 98.769, "floor": 1, "navigation_complexity": 1.0, "room": 1, "ssa": 1.508}, "qual": 2}, "Norvelt": {"id": "Norvelt", "name": "model-563", "split_full": "none", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 19.028, "floor": 1, "navigation_complexity": 1.392, "room": 3, "ssa": 1.607}, "qual": 2}, "Noxapater": {"id": "Noxapater", "name": "model-264", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "test", "stats": {"area": 308.385, "floor": 4, "navigation_complexity": 4.151, "room": 12, "ssa": 1.82}, "qual": 3}, "Nuevo": {"id": "Nuevo", "name": "model-505", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 254.781, "floor": 3, "navigation_complexity": 1.294, "room": 16, "ssa": 2.016}, "qual": 4}, "Ogilvie": {"id": "Ogilvie", "name": "model-39", "split_full": "none", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 47.725, "floor": 1, "navigation_complexity": 2.0, "room": 1, "ssa": 1.655}, "qual": 2}, "Ohoopee": {"id": "Ohoopee", "name": "model-248", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 322.951, "floor": 4, "navigation_complexity": 
1.585, "room": 19, "ssa": 1.725}, "qual": 3}, "Okabena": {"id": "Okabena", "name": "model-436", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 804.364, "floor": 3, "navigation_complexity": 1.624, "room": 24, "ssa": 1.309}, "qual": 3}, "Onaga": {"id": "Onaga", "name": "model-74", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 182.294, "floor": 3, "navigation_complexity": 5.571, "room": 12, "ssa": 1.863}, "qual": 3}, "Ooltewah": {"id": "Ooltewah", "name": "model-265", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 664.17, "floor": 1, "navigation_complexity": 4.907, "room": 33, "ssa": 1.337}, "qual": 2}, "Ophir": {"id": "Ophir", "name": "model-532", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 64.853, "floor": 1, "navigation_complexity": 1.947, "room": 1, "ssa": 1.34}, "qual": 2}, "Orangeburg": {"id": "Orangeburg", "name": "model-342", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 891.403, "floor": 4, "navigation_complexity": 2.414, "room": 22, "ssa": 1.144}, "qual": 2}, "Orason": {"id": "Orason", "name": "model-205", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 188.151, "floor": 1, "navigation_complexity": 3.851, "room": 2, "ssa": 1.483}, "qual": 2}, "Oriole": {"id": "Oriole", "name": "model-339", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 360.957, "floor": 3, "navigation_complexity": 3.436, "room": 16, "ssa": 1.919}, "qual": 2}, "Ossipee": {"id": "Ossipee", "name": "model-21", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 92.354, "floor": 1, "navigation_complexity": 1.224, "room": 4, "ssa": 1.263}, "qual": 2}, "Ovalo": {"id": "Ovalo", "name": "model-83", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 269.365, "floor": 4, "navigation_complexity": 1.0, "room": 16, "ssa": 1.256}, "qual": 2}, "Oyens": {"id": "Oyens", "name": "model-335", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 100.709, "floor": 1, "navigation_complexity": 5.614, "room": 1, "ssa": 1.496}, "qual": 4}, "Pablo": {"id": "Pablo", "name": "model-486", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 37.789, "floor": 1, "navigation_complexity": 1.0, "room": 4, "ssa": 1.883}, "qual": 4}, "Paige": {"id": "Paige", "name": "model-87", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 612.688, "floor": 3, "navigation_complexity": 2.073, "room": 26, "ssa": 1.046}, "qual": 2}, "Pamelia": {"id": "Pamelia", "name": "model-348", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 237.84, "floor": 3, "navigation_complexity": 5.084, "room": 17, "ssa": 1.522}, "qual": 2}, "Parole": {"id": "Parole", "name": "model-447", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 55.245, "floor": 1, "navigation_complexity": 2.658, "room": 6, "ssa": 1.545}, "qual": 4}, 
"Pasatiempo": {"id": "Pasatiempo", "name": "model-355", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 159.202, "floor": 1, "navigation_complexity": 1.245, "room": 8, "ssa": 1.834}, "qual": 2}, "Peacock": {"id": "Peacock", "name": "model-34", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 81.688, "floor": 1, "navigation_complexity": 1.0, "room": 9, "ssa": 1.533}, "qual": 3}, "Pearce": {"id": "Pearce", "name": "model-384", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 513.604, "floor": 3, "navigation_complexity": 5.5, "room": 24, "ssa": 1.212}, "qual": 2}, "Peconic": {"id": "Peconic", "name": "model-166", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 494.275, "floor": 4, "navigation_complexity": 1.611, "room": 11, "ssa": 1.277}, "qual": 2}, "Peden": {"id": "Peden", "name": "model-340", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 68.342, "floor": 1, "navigation_complexity": 2.26, "room": 6, "ssa": 1.919}, "qual": 3}, "Pettigrew": {"id": "Pettigrew", "name": "model-525", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 161.586, "floor": 1, "navigation_complexity": 2.456, "room": 11, "ssa": 2.061}, "qual": 4}, "Pinesdale": {"id": "Pinesdale", "name": "model-299", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 1071.716, "floor": 6, "navigation_complexity": 5.686, "room": 19, "ssa": 1.764}, "qual": 2}, "Pittsburg": {"id": "Pittsburg", "name": "model-492", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 152.529, "floor": 4, "navigation_complexity": 1.012, "room": 11, "ssa": 1.024}, "qual": 3}, "Placida": {"id": "Placida", "name": "model-398", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 83.857, "floor": 1, "navigation_complexity": 1.407, "room": 8, "ssa": 1.764}, "qual": 4}, "Pleasant": {"id": "Pleasant", "name": "model-163", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 162.014, "floor": 1, "navigation_complexity": 2.404, "room": 11, "ssa": 1.553}, "qual": 4}, "Plessis": {"id": "Plessis", "name": "model-228", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 103.245, "floor": 2, "navigation_complexity": 2.157, "room": 9, "ssa": 1.821}, "qual": 4}, "Plumerville": {"id": "Plumerville", "name": "model-485", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 553.505, "floor": 3, "navigation_complexity": 2.785, "room": 15, "ssa": 1.883}, "qual": 2}, "Pocasset": {"id": "Pocasset", "name": "model-393", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 343.544, "floor": 4, "navigation_complexity": 1.083, "room": 24, "ssa": 1.049}, "qual": 3}, "Pocopson": {"id": "Pocopson", "name": "model-118", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 296.255, "floor": 1, "navigation_complexity": 3.014, "room": 12, "ssa": 1.297}, "qual": 3}, "Poipu": {"id": 
"Poipu", "name": "model-271", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 522.602, "floor": 5, "navigation_complexity": 2.825, "room": 15, "ssa": 1.363}, "qual": 2}, "Pomaria": {"id": "Pomaria", "name": "model-430", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 612.16, "floor": 5, "navigation_complexity": 5.661, "room": 23, "ssa": 1.228}, "qual": 3}, "Portal": {"id": "Portal", "name": "model-92", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 857.078, "floor": 4, "navigation_complexity": 1.745, "room": 19, "ssa": 1.03}, "qual": 2}, "Portola": {"id": "Portola", "name": "model-314", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 718.033, "floor": 4, "navigation_complexity": 6.196, "room": 19, "ssa": 0.785}, "qual": 1}, "Potosi": {"id": "Potosi", "name": "model-350", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 513.183, "floor": 4, "navigation_complexity": 1.701, "room": 23, "ssa": 0.81}, "qual": 2}, "Potterville": {"id": "Potterville", "name": "model-457", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 427.967, "floor": 4, "navigation_complexity": 3.929, "room": 12, "ssa": 1.224}, "qual": 2}, "Poyen": {"id": "Poyen", "name": "model-230", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 680.296, "floor": 3, "navigation_complexity": 1.885, "room": 18, "ssa": 1.637}, "qual": 3}, "Purple": {"id": "Purple", "name": "model-210", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 737.666, "floor": 4, "navigation_complexity": 2.051, "room": 30, "ssa": 1.868}, "qual": 4}, "Quantico": {"id": "Quantico", "name": "model-108", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 85.135, "floor": 1, "navigation_complexity": 2.576, "room": 6, "ssa": 1.647}, "qual": 4}, "Rabbit": {"id": "Rabbit", "name": "model-540", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 60.857, "floor": 1, "navigation_complexity": 3.3, "room": 2, "ssa": 1.641}, "qual": 4}, "Ranchester": {"id": "Ranchester", "name": "model-552", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 741.032, "floor": 4, "navigation_complexity": 6.666, "room": 18, "ssa": 1.391}, "qual": 2}, "Rancocas": {"id": "Rancocas", "name": "model-20", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 116.738, "floor": 1, "navigation_complexity": 3.694, "room": 7, "ssa": 1.425}, "qual": 4}, "Random": {"id": "Random", "name": "model-440", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 295.808, "floor": 3, "navigation_complexity": 1.005, "room": 16, "ssa": 1.576}, "qual": 3}, "Readsboro": {"id": "Readsboro", "name": "model-524", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 439.666, "floor": 4, "navigation_complexity": 6.789, "room": 17, "ssa": 1.687}, "qual": 2}, "Redbank": {"id": "Redbank", "name": 
"model-365", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 673.551, "floor": 5, "navigation_complexity": 3.915, "room": 19, "ssa": 0.988}, "qual": 1}, "Reserve": {"id": "Reserve", "name": "model-7", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 660.009, "floor": 4, "navigation_complexity": 2.8, "room": 20, "ssa": 1.741}, "qual": 3}, "Retsof": {"id": "Retsof", "name": "model-404", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 499.225, "floor": 4, "navigation_complexity": 1.304, "room": 23, "ssa": 1.346}, "qual": 2}, "Reyno": {"id": "Reyno", "name": "model-306", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 736.104, "floor": 3, "navigation_complexity": 4.31, "room": 26, "ssa": 1.988}, "qual": 4}, "Ribera": {"id": "Ribera", "name": "model-487", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 50.645, "floor": 1, "navigation_complexity": 1.666, "room": 3, "ssa": 0.823}, "qual": 4}, "Roane": {"id": "Roane", "name": "model-37", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 65.459, "floor": 1, "navigation_complexity": 1.87, "room": 3, "ssa": 0.936}, "qual": 4}, "Rockport": {"id": "Rockport", "name": "model-415", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 208.37, "floor": 3, "navigation_complexity": 1.395, "room": 12, "ssa": 0.879}, "qual": 4}, "Roeville": {"id": "Roeville", "name": "model-304", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 484.984, "floor": 4, "navigation_complexity": 1.03, "room": 21, "ssa": 1.008}, "qual": 4}, "Rogue": {"id": "Rogue", "name": "model-200", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 440.36, "floor": 4, "navigation_complexity": 3.967, "room": 21, "ssa": 1.546}, "qual": 3}, "Rosenberg": {"id": "Rosenberg", "name": "model-545", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 86.103, "floor": 1, "navigation_complexity": 2.37, "room": 5, "ssa": 1.66}, "qual": 4}, "Rosser": {"id": "Rosser", "name": "model-470", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 29.918, "floor": 1, "navigation_complexity": 1.373, "room": 4, "ssa": 1.238}, "qual": 4}, "Rough": {"id": "Rough", "name": "model-557", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 518.468, "floor": 3, "navigation_complexity": 5.971, "room": 6, "ssa": 1.508}, "qual": 2}, "Roxboro": {"id": "Roxboro", "name": "model-477", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 62.206, "floor": 1, "navigation_complexity": 2.79, "room": 1, "ssa": 1.21}, "qual": 4}, "Ruckersville": {"id": "Ruckersville", "name": "model-268", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 913.224, "floor": 1, "navigation_complexity": 7.826, "room": 39, "ssa": 1.44}, "qual": 3}, "Rutherford": {"id": "Rutherford", "name": "model-51", "split_full": "train", 
"split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 560.797, "floor": 2, "navigation_complexity": 4.638, "room": 16, "ssa": 1.791}, "qual": 3}, "Sagerton": {"id": "Sagerton", "name": "model-222", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 391.372, "floor": 3, "navigation_complexity": 2.814, "room": 23, "ssa": 1.398}, "qual": 3}, "Samuels": {"id": "Samuels", "name": "model-522", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 117.083, "floor": 1, "navigation_complexity": 1.734, "room": 10, "ssa": 1.96}, "qual": 4}, "Sanctuary": {"id": "Sanctuary", "name": "model-310", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 289.285, "floor": 1, "navigation_complexity": 3.823, "room": 19, "ssa": 1.083}, "qual": 4}, "Sands": {"id": "Sands", "name": "model-102", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 153.132, "floor": 3, "navigation_complexity": 2.109, "room": 10, "ssa": 1.352}, "qual": 4}, "Sarcoxie": {"id": "Sarcoxie", "name": "model-332", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 286.12, "floor": 3, "navigation_complexity": 1.089, "room": 12, "ssa": 1.443}, "qual": 2}, "Sargents": {"id": "Sargents", "name": "model-490", "split_full": "none", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 5930.231, "floor": 3, "navigation_complexity": 18.039, "room": 76, "ssa": 0.885}, "qual": null}, "Sasakwa": {"id": "Sasakwa", "name": "model-144", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 45.35, "floor": 1, "navigation_complexity": 2.543, "room": 5, "ssa": 1.134}, "qual": 4}, "Sawpit": {"id": "Sawpit", "name": "model-459", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 77.847, "floor": 1, "navigation_complexity": 2.074, "room": 6, "ssa": 1.274}, "qual": 4}, "Scandinavia": {"id": "Scandinavia", "name": "model-535", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 531.017, "floor": 3, "navigation_complexity": 2.885, "room": 22, "ssa": 0.794}, "qual": 2}, "Schiller": {"id": "Schiller", "name": "model-416", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 599.407, "floor": 4, "navigation_complexity": 3.049, "room": 22, "ssa": 0.879}, "qual": 2}, "Schoolcraft": {"id": "Schoolcraft", "name": "model-145", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 882.38, "floor": 1, "navigation_complexity": 1.67, "room": 3, "ssa": 1.561}, "qual": 2}, "Scioto": {"id": "Scioto", "name": "model-372", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 512.963, "floor": 5, "navigation_complexity": 1.072, "room": 23, "ssa": 1.116}, "qual": 4}, "Scottsmoor": {"id": "Scottsmoor", "name": "model-567", "split_full": "none", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 61.099, "floor": 1, "navigation_complexity": 1.0, "room": 1, "ssa": 1.851}, "qual": null}, "Seatonville": {"id": "Seatonville", "name": "model-475", "split_full": "train", 
"split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 2832.57, "floor": 6, "navigation_complexity": 7.291, "room": 3, "ssa": 1.297}, "qual": 3}, "Seeley": {"id": "Seeley", "name": "model-237", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 554.521, "floor": 3, "navigation_complexity": 2.365, "room": 22, "ssa": 1.609}, "qual": 3}, "Seiling": {"id": "Seiling", "name": "model-390", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1020.102, "floor": 1, "navigation_complexity": 2.771, "room": 5, "ssa": 0.82}, "qual": 3}, "Seward": {"id": "Seward", "name": "model-421", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 59.1, "floor": 1, "navigation_complexity": 1.016, "room": 5, "ssa": 1.583}, "qual": 4}, "Sharon": {"id": "Sharon", "name": "model-212", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1398.591, "floor": 6, "navigation_complexity": 6.9, "room": 19, "ssa": 1.104}, "qual": 2}, "Shauck": {"id": "Shauck", "name": "model-445", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 520.761, "floor": 4, "navigation_complexity": 2.128, "room": 23, "ssa": 1.375}, "qual": 3}, "Shelbiana": {"id": "Shelbiana", "name": "model-528", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 97.743, "floor": 1, "navigation_complexity": 3.972, "room": 10, "ssa": 1.601}, "qual": 4}, "Shelbyville": {"id": "Shelbyville", "name": "model-455", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 544.749, "floor": 4, "navigation_complexity": 5.265, "room": 29, "ssa": 1.302}, "qual": 2}, "Shellsburg": {"id": "Shellsburg", "name": "model-44", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1155.826, "floor": 4, "navigation_complexity": 1.116, "room": 24, "ssa": 1.896}, "qual": 1}, "Sherrill": {"id": "Sherrill", "name": "model-553", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 3081.432, "floor": 4, "navigation_complexity": 15.187, "room": 46, "ssa": 1.074}, "qual": 2}, "Shingler": {"id": "Shingler", "name": "model-187", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 105.358, "floor": 1, "navigation_complexity": 1.0, "room": 8, "ssa": 1.662}, "qual": 4}, "Shumway": {"id": "Shumway", "name": "model-419", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 521.75, "floor": 4, "navigation_complexity": 1.995, "room": 21, "ssa": 1.208}, "qual": 2}, "Silas": {"id": "Silas", "name": "model-48", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 345.962, "floor": 3, "navigation_complexity": 1.05, "room": 18, "ssa": 1.241}, "qual": 4}, "Silerton": {"id": "Silerton", "name": "model-269", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 441.747, "floor": 4, "navigation_complexity": 1.438, "room": 17, "ssa": 1.643}, "qual": 3}, "Silva": {"id": "Silva", "name": "model-250", "split_full": "none", "split_full+": "train", 
"split_medium": "none", "split_tiny": "none", "stats": {"area": 626.12, "floor": 3, "navigation_complexity": 3.337, "room": 18, "ssa": 1.241}, "qual": 1}, "Siren": {"id": "Siren", "name": "model-483", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 739.196, "floor": 3, "navigation_complexity": 5.496, "room": 23, "ssa": 2.332}, "qual": 2}, "Sisters": {"id": "Sisters", "name": "model-431", "split_full": "none", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 94.529, "floor": 2, "navigation_complexity": 1.318, "room": 6, "ssa": 1.743}, "qual": 4}, "Smoketown": {"id": "Smoketown", "name": "model-426", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 617.315, "floor": 3, "navigation_complexity": 1.977, "room": 16, "ssa": 1.189}, "qual": 2}, "Sodaville": {"id": "Sodaville", "name": "model-120", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 114.409, "floor": 2, "navigation_complexity": 2.087, "room": 8, "ssa": 1.431}, "qual": 4}, "Soldier": {"id": "Soldier", "name": "model-270", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 262.672, "floor": 3, "navigation_complexity": 4.147, "room": 11, "ssa": 1.643}, "qual": 4}, "Sontag": {"id": "Sontag", "name": "model-448", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 680.769, "floor": 4, "navigation_complexity": 1.368, "room": 17, "ssa": 1.545}, "qual": 3}, "Southfield": {"id": "Southfield", "name": "model-111", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 797.823, "floor": 5, "navigation_complexity": 1.6, "room": 21, "ssa": 1.609}, "qual": 2}, "Spencerville": {"id": "Spencerville", "name": "model-471", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 72.589, "floor": 1, "navigation_complexity": 1.768, "room": 4, "ssa": 1.149}, "qual": 4}, "Spotswood": {"id": "Spotswood", "name": "model-452", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 292.725, "floor": 3, "navigation_complexity": 3.662, "room": 15, "ssa": 1.415}, "qual": 4}, "Spread": {"id": "Spread", "name": "model-236", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 760.574, "floor": 3, "navigation_complexity": 2.945, "room": 20, "ssa": 1.219}, "qual": 3}, "Springerville": {"id": "Springerville", "name": "model-491", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 291.831, "floor": 3, "navigation_complexity": 5.566, "room": 12, "ssa": 1.024}, "qual": 3}, "Springhill": {"id": "Springhill", "name": "model-43", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 549.54, "floor": 4, "navigation_complexity": 1.388, "room": 11, "ssa": 1.896}, "qual": 4}, "Stanleyville": {"id": "Stanleyville", "name": "model-423", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 283.356, "floor": 3, "navigation_complexity": 3.199, "room": 11, "ssa": 1.353}, "qual": 4}, "Starks": {"id": "Starks", "name": "model-235", "split_full": "val", 
"split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 353.106, "floor": 3, "navigation_complexity": 2.954, "room": 16, "ssa": 1.219}, "qual": 3}, "Stigler": {"id": "Stigler", "name": "model-45", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 3134.479, "floor": 2, "navigation_complexity": 16.449, "room": 37, "ssa": 1.847}, "qual": 0}, "Stilwell": {"id": "Stilwell", "name": "model-514", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 358.399, "floor": 3, "navigation_complexity": 7.782, "room": 16, "ssa": 1.136}, "qual": 4}, "Stockertown": {"id": "Stockertown", "name": "model-152", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 844.615, "floor": 4, "navigation_complexity": 4.842, "room": 37, "ssa": 0.9}, "qual": 2}, "Stockman": {"id": "Stockman", "name": "model-255", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 740.969, "floor": 4, "navigation_complexity": 7.157, "room": 27, "ssa": 2.173}, "qual": 3}, "Stockwell": {"id": "Stockwell", "name": "model-460", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 629.624, "floor": 3, "navigation_complexity": 1.251, "room": 20, "ssa": 1.274}, "qual": 2}, "Stokes": {"id": "Stokes", "name": "model-198", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 276.521, "floor": 3, "navigation_complexity": 4.085, "room": 9, "ssa": 1.455}, "qual": 4}, "Sugarville": {"id": "Sugarville", "name": "model-27", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 564.827, "floor": 3, "navigation_complexity": 5.236, "room": 16, "ssa": 1.957}, "qual": 3}, "Sultan": {"id": "Sultan", "name": "model-427", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 357.887, "floor": 1, "navigation_complexity": 3.541, "room": 16, "ssa": 1.645}, "qual": 3}, "Sumas": {"id": "Sumas", "name": "model-473", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 45.342, "floor": 1, "navigation_complexity": 2.888, "room": 3, "ssa": 1.353}, "qual": 4}, "Sundown": {"id": "Sundown", "name": "model-85", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 165.31, "floor": 1, "navigation_complexity": 3.82, "room": 9, "ssa": 1.287}, "qual": 3}, "Sunshine": {"id": "Sunshine", "name": "model-392", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 1166.008, "floor": 5, "navigation_complexity": 3.056, "room": 28, "ssa": 0.9}, "qual": 2}, "Superior": {"id": "Superior", "name": "model-18", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 220.462, "floor": 1, "navigation_complexity": 5.106, "room": 14, "ssa": 1.84}, "qual": 4}, "Sussex": {"id": "Sussex", "name": "model-364", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 405.669, "floor": 3, "navigation_complexity": 8.84, "room": 17, "ssa": 1.281}, "qual": 2}, "Sweatman": {"id": "Sweatman", "name": "model-413", "split_full": "train", 
"split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 108.333, "floor": 1, "navigation_complexity": 4.382, "room": 6, "ssa": 0.871}, "qual": 3}, "Swisshome": {"id": "Swisshome", "name": "model-370", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 377.373, "floor": 4, "navigation_complexity": 2.844, "room": 20, "ssa": 1.637}, "qual": 3}, "Swormville": {"id": "Swormville", "name": "model-298", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 66.793, "floor": 1, "navigation_complexity": 3.213, "room": 7, "ssa": 0.944}, "qual": 4}, "Tallmadge": {"id": "Tallmadge", "name": "model-185", "split_full": "none", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 56.185, "floor": 1, "navigation_complexity": 1.483, "room": 4, "ssa": 1.442}, "qual": 3}, "Tansboro": {"id": "Tansboro", "name": "model-24", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 378.932, "floor": 3, "navigation_complexity": 1.941, "room": 20, "ssa": 2.077}, "qual": 2}, "Tariffville": {"id": "Tariffville", "name": "model-544", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 160.382, "floor": 1, "navigation_complexity": 3.569, "room": 8, "ssa": 2.011}, "qual": 2}, "Terrell": {"id": "Terrell", "name": "model-127", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 694.299, "floor": 1, "navigation_complexity": 1.837, "room": 34, "ssa": 1.161}, "qual": 0}, "Texasville": {"id": "Texasville", "name": "model-93", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 355.121, "floor": 3, "navigation_complexity": 2.11, "room": 16, "ssa": 1.602}, "qual": 2}, "Thrall": {"id": "Thrall", "name": "model-181", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 1106.305, "floor": 3, "navigation_complexity": 7.456, "room": 28, "ssa": 1.56}, "qual": 1}, "Tilghmanton": {"id": "Tilghmanton", "name": "model-292", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 605.518, "floor": 4, "navigation_complexity": 2.651, "room": 15, "ssa": 1.271}, "qual": 0}, "Tillman": {"id": "Tillman", "name": "model-195", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1387.564, "floor": 1, "navigation_complexity": 4.747, "room": 32, "ssa": 1.16}, "qual": 0}, "Timberon": {"id": "Timberon", "name": "model-161", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 296.948, "floor": 4, "navigation_complexity": 1.6, "room": 17, "ssa": 1.046}, "qual": 2}, "Tippecanoe": {"id": "Tippecanoe", "name": "model-188", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 154.358, "floor": 2, "navigation_complexity": 1.786, "room": 7, "ssa": 1.662}, "qual": 3}, "Tokeland": {"id": "Tokeland", "name": "model-69", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 315.287, "floor": 3, "navigation_complexity": 1.045, "room": 13, "ssa": 1.416}, "qual": 2}, "Tolstoy": {"id": "Tolstoy", "name": "model-454", "split_full": 
"train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 543.053, "floor": 3, "navigation_complexity": 6.657, "room": 19, "ssa": 1.297}, "qual": null}, "Tomales": {"id": "Tomales", "name": "model-42", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 415.188, "floor": 3, "navigation_complexity": 1.044, "room": 17, "ssa": 1.975}, "qual": 2}, "Tomkins": {"id": "Tomkins", "name": "model-77", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 850.745, "floor": 3, "navigation_complexity": 4.928, "room": 28, "ssa": 1.732}, "qual": 0}, "Torrington": {"id": "Torrington", "name": "model-482", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 781.388, "floor": 4, "navigation_complexity": 2.674, "room": 30, "ssa": 1.256}, "qual": 2}, "Touhy": {"id": "Touhy", "name": "model-456", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 668.494, "floor": 4, "navigation_complexity": 2.306, "room": 15, "ssa": 1.302}, "qual": 3}, "Tradewinds": {"id": "Tradewinds", "name": "model-394", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 112.227, "floor": 1, "navigation_complexity": 2.865, "room": 1, "ssa": 1.049}, "qual": 2}, "Trail": {"id": "Trail", "name": "model-162", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 535.171, "floor": 4, "navigation_complexity": 3.123, "room": 20, "ssa": 1.046}, "qual": 2}, "Traver": {"id": "Traver", "name": "model-251", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 390.532, "floor": 3, "navigation_complexity": 3.308, "room": 18, "ssa": 1.494}, "qual": 3}, "Tyler": {"id": "Tyler", "name": "model-26", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 216.845, "floor": 3, "navigation_complexity": 2.21, "room": 13, "ssa": 1.924}, "qual": 3}, "Tysons": {"id": "Tysons", "name": "model-494", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 712.519, "floor": 3, "navigation_complexity": 3.156, "room": 27, "ssa": 1.941}, "qual": 1}, "Umpqua": {"id": "Umpqua", "name": "model-136", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 400.83, "floor": 3, "navigation_complexity": 1.976, "room": 15, "ssa": 0.909}, "qual": 3}, "Uncertain": {"id": "Uncertain", "name": "model-529", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 150.664, "floor": 1, "navigation_complexity": 1.283, "room": 1, "ssa": 0.938}, "qual": 3}, "Upham": {"id": "Upham", "name": "model-389", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1840.942, "floor": 6, "navigation_complexity": 6.869, "room": 48, "ssa": 0.82}, "qual": 2}, "Uvalda": {"id": "Uvalda", "name": "model-131", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "test", "stats": {"area": 616.935, "floor": 5, "navigation_complexity": 3.875, "room": 28, "ssa": 1.174}, "qual": 3}, "Vacherie": {"id": "Vacherie", "name": "model-29", "split_full": "test", "split_full+": 
"test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 455.855, "floor": 3, "navigation_complexity": 5.836, "room": 9, "ssa": 1.673}, "qual": 3}, "Vails": {"id": "Vails", "name": "model-225", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 97.904, "floor": 2, "navigation_complexity": 1.313, "room": 3, "ssa": 1.526}, "qual": 1}, "Victorville": {"id": "Victorville", "name": "model-353", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 455.395, "floor": 4, "navigation_complexity": 5.257, "room": 23, "ssa": 2.052}, "qual": 1}, "Voorhees": {"id": "Voorhees", "name": "model-227", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 269.619, "floor": 1, "navigation_complexity": 4.354, "room": 12, "ssa": 1.821}, "qual": 3}, "Waimea": {"id": "Waimea", "name": "model-88", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 684.211, "floor": 4, "navigation_complexity": 1.404, "room": 28, "ssa": 1.046}, "qual": 2}, "Wainscott": {"id": "Wainscott", "name": "model-541", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 635.302, "floor": 3, "navigation_complexity": 11.597, "room": 21, "ssa": 1.28}, "qual": 1}, "Waipahu": {"id": "Waipahu", "name": "model-444", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 217.547, "floor": 4, "navigation_complexity": 1.459, "room": 10, "ssa": 1.457}, "qual": 3}, "Wakeman": {"id": "Wakeman", "name": "model-286", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 420.228, "floor": 4, "navigation_complexity": 2.558, "room": 27, "ssa": 1.067}, "qual": 3}, "Waldenburg": {"id": "Waldenburg", "name": "model-410", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 289.497, "floor": 1, "navigation_complexity": 5.257, "room": 4, "ssa": 0.821}, "qual": 2}, "Wando": {"id": "Wando", "name": "model-221", "split_full": "test", "split_full+": "test", "split_medium": "test", "split_tiny": "none", "stats": {"area": 75.131, "floor": 1, "navigation_complexity": 3.396, "room": 5, "ssa": 1.398}, "qual": 3}, "Wappingers": {"id": "Wappingers", "name": "model-183", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 905.911, "floor": 3, "navigation_complexity": 3.133, "room": 23, "ssa": 1.012}, "qual": 2}, "Warrenville": {"id": "Warrenville", "name": "model-300", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 394.612, "floor": 4, "navigation_complexity": 1.331, "room": 14, "ssa": 1.764}, "qual": 2}, "Wattsville": {"id": "Wattsville", "name": "model-263", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 128.299, "floor": 1, "navigation_complexity": 1.0, "room": 8, "ssa": 1.82}, "qual": 3}, "Waucousta": {"id": "Waucousta", "name": "model-239", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 372.016, "floor": 4, "navigation_complexity": 2.552, "room": 14, "ssa": 1.575}, "qual": 3}, "Waukeenah": {"id": "Waukeenah", "name": "model-115", "split_full": "train", "split_full+": 
"train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 215.282, "floor": 4, "navigation_complexity": 5.885, "room": 12, "ssa": 1.613}, "qual": 2}, "Webster": {"id": "Webster", "name": "model-175", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 788.23, "floor": 1, "navigation_complexity": 2.64, "room": 8, "ssa": 1.091}, "qual": 1}, "Weleetka": {"id": "Weleetka", "name": "model-245", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 1357.157, "floor": 4, "navigation_complexity": 8.402, "room": 26, "ssa": 0.498}, "qual": 2}, "Wells": {"id": "Wells", "name": "model-371", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 120.708, "floor": 1, "navigation_complexity": 3.345, "room": 9, "ssa": 1.116}, "qual": 3}, "Wesley": {"id": "Wesley", "name": "model-70", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 477.22, "floor": 3, "navigation_complexity": 2.512, "room": 19, "ssa": 1.416}, "qual": 2}, "Westerville": {"id": "Westerville", "name": "model-297", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 359.709, "floor": 3, "navigation_complexity": 2.337, "room": 13, "ssa": 0.944}, "qual": 3}, "Westfield": {"id": "Westfield", "name": "model-319", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 474.859, "floor": 3, "navigation_complexity": 5.559, "room": 23, "ssa": 1.593}, "qual": 3}, "Whiteriver": {"id": "Whiteriver", "name": "model-262", "split_full": "none", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 331.766, "floor": 5, "navigation_complexity": 1.967, "room": 25, "ssa": 1.438}, "qual": 1}, "Whitethorn": {"id": "Whitethorn", "name": "model-154", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 108.069, "floor": 1, "navigation_complexity": 1.21, "room": 6, "ssa": 1.008}, "qual": 2}, "Wiconisco": {"id": "Wiconisco", "name": "model-137", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "val", "stats": {"area": 736.625, "floor": 5, "navigation_complexity": 6.327, "room": 28, "ssa": 1.827}, "qual": 3}, "Wilbraham": {"id": "Wilbraham", "name": "model-503", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 75.49, "floor": 1, "navigation_complexity": 1.925, "room": 6, "ssa": 1.616}, "qual": 3}, "Wilkesboro": {"id": "Wilkesboro", "name": "model-192", "split_full": "test", "split_full+": "test", "split_medium": "none", "split_tiny": "none", "stats": {"area": 734.661, "floor": 4, "navigation_complexity": 1.791, "room": 19, "ssa": 1.195}, "qual": 3}, "Wilkinsburg": {"id": "Wilkinsburg", "name": "model-377", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 518.876, "floor": 3, "navigation_complexity": 4.693, "room": 24, "ssa": 1.175}, "qual": 2}, "Willow": {"id": "Willow", "name": "model-151", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 662.264, "floor": 3, "navigation_complexity": 6.441, "room": 24, "ssa": 0.9}, "qual": 2}, "Wilseyville": {"id": "Wilseyville", "name": "model-174", "split_full": "train", 
"split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 338.51, "floor": 4, "navigation_complexity": 2.372, "room": 21, "ssa": 1.342}, "qual": 1}, "Windhorst": {"id": "Windhorst", "name": "model-559", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 88.42, "floor": 1, "navigation_complexity": 1.518, "room": 7, "ssa": 0.678}, "qual": 1}, "Winfield": {"id": "Winfield", "name": "model-414", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 848.274, "floor": 5, "navigation_complexity": 1.733, "room": 19, "ssa": 0.871}, "qual": 1}, "Winooski": {"id": "Winooski", "name": "model-337", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "none", "stats": {"area": 1028.84, "floor": 5, "navigation_complexity": 4.2, "room": 70, "ssa": 0.742}, "qual": 3}, "Winthrop": {"id": "Winthrop", "name": "model-55", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 419.476, "floor": 4, "navigation_complexity": 1.0, "room": 18, "ssa": 1.931}, "qual": 3}, "Woodbine": {"id": "Woodbine", "name": "model-411", "split_full": "train", "split_full+": "train", "split_medium": "train", "split_tiny": "train", "stats": {"area": 984.751, "floor": 4, "navigation_complexity": 7.201, "room": 20, "ssa": 0.916}, "qual": 1}, "Woonsocket": {"id": "Woonsocket", "name": "model-12", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 82.938, "floor": 1, "navigation_complexity": 2.273, "room": 4, "ssa": 1.91}, "qual": 4}, "Wyatt": {"id": "Wyatt", "name": "model-478", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 716.924, "floor": 3, "navigation_complexity": 9.961, "room": 22, "ssa": 1.21}, "qual": 2}, "Wyldwood": {"id": "Wyldwood", "name": "model-330", "split_full": "val", "split_full+": "val", "split_medium": "val", "split_tiny": "none", "stats": {"area": 191.908, "floor": 4, "navigation_complexity": 1.226, "room": 12, "ssa": 1.156}, "qual": 2}, "Yadkinville": {"id": "Yadkinville", "name": "model-571", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 504.718, "floor": 3, "navigation_complexity": 3.82, "room": 18, "ssa": 1.402}, "qual": 0}, "Yankeetown": {"id": "Yankeetown", "name": "model-321", "split_full": "train", "split_full+": "train", "split_medium": "none", "split_tiny": "none", "stats": {"area": 842.17, "floor": 3, "navigation_complexity": 2.917, "room": 17, "ssa": 1.68}, "qual": 0}, "Yscloskey": {"id": "Yscloskey", "name": "model-458", "split_full": "val", "split_full+": "val", "split_medium": "none", "split_tiny": "none", "stats": {"area": 370.472, "floor": 3, "navigation_complexity": 3.26, "room": 16, "ssa": 1.224}, "qual": 2}} diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/multi_node_slurm.sh b/habitat-lab-dialog/habitat_baselines/rl/ddppo/multi_node_slurm.sh new file mode 100644 index 0000000..db98b32 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/multi_node_slurm.sh @@ -0,0 +1,23 @@ +#!/bin/bash +#SBATCH --job-name=ddppo +#SBATCH --output=logs.ddppo.out +#SBATCH --error=logs.ddppo.err +#SBATCH --gres gpu:1 +#SBATCH --nodes 1 +#SBATCH --cpus-per-task 10 +#SBATCH --ntasks-per-node 1 +#SBATCH --mem=60GB +#SBATCH --time=12:00 +#SBATCH --signal=USR1@600 +#SBATCH 
diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/multi_node_slurm.sh b/habitat-lab-dialog/habitat_baselines/rl/ddppo/multi_node_slurm.sh
new file mode 100644
index 0000000..db98b32
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/multi_node_slurm.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+#SBATCH --job-name=ddppo
+#SBATCH --output=logs.ddppo.out
+#SBATCH --error=logs.ddppo.err
+#SBATCH --gres gpu:1
+#SBATCH --nodes 1
+#SBATCH --cpus-per-task 10
+#SBATCH --ntasks-per-node 1
+#SBATCH --mem=60GB
+#SBATCH --time=12:00
+#SBATCH --signal=USR1@600
+#SBATCH --partition=dev
+
+export GLOG_minloglevel=2
+export MAGNUM_LOG=quiet
+
+MASTER_ADDR=$(srun --ntasks=1 hostname 2>&1 | tail -n1)
+export MASTER_ADDR
+
+set -x
+srun python -u -m habitat_baselines.run \
+    --exp-config habitat_baselines/config/pointnav/ddppo_pointnav.yaml \
+    --run-type train
diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/policy/__init__.py b/habitat-lab-dialog/habitat_baselines/rl/ddppo/policy/__init__.py
new file mode 100644
index 0000000..429557a
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/policy/__init__.py
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .resnet_policy import PointNavResNetPolicy  # noqa: F401.
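Because the package `__init__.py` re-exports the policy class and `resnet_policy.py` decorates it with `@baseline_registry.register_policy` (see below), downstream code can obtain the class either by a plain import or by name lookup. A minimal sketch, assuming habitat-baselines is importable:

    from habitat_baselines.common.baseline_registry import baseline_registry
    from habitat_baselines.rl.ddppo.policy import PointNavResNetPolicy

    # The register_policy decorator should also make the class
    # retrievable by name through the registry:
    policy_cls = baseline_registry.get_policy("PointNavResNetPolicy")
    assert policy_cls is PointNavResNetPolicy
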
_build_se_branch(planes, r=16): + return SE(planes, r) + + +class Bottleneck(nn.Module): + expansion = 4 + resneXt = False + + def __init__( + self, + inplanes: int, + planes: int, + ngroups: int, + stride: int = 1, + downsample: Optional[Sequential] = None, + cardinality: int = 1, + ) -> None: + super().__init__() + self.convs = _build_bottleneck_branch( + inplanes, + planes, + ngroups, + stride, + self.expansion, + groups=cardinality, + ) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + def _impl(self, x: Tensor) -> Tensor: + identity = x + + out = self.convs(x) + + if self.downsample is not None: + identity = self.downsample(x) + + return self.relu(out + identity) + + def forward(self, x: Tensor) -> Tensor: + return self._impl(x) + + +class SEBottleneck(Bottleneck): + def __init__( + self, + inplanes, + planes, + ngroups, + stride=1, + downsample=None, + cardinality=1, + ): + super().__init__( + inplanes, planes, ngroups, stride, downsample, cardinality + ) + + self.se = _build_se_branch(planes * self.expansion) + + def _impl(self, x): + identity = x + + out = self.convs(x) + out = self.se(out) * out + + if self.downsample is not None: + identity = self.downsample(x) + + return self.relu(out + identity) + + +class SEResNeXtBottleneck(SEBottleneck): + expansion = 2 + resneXt = True + + +class ResNeXtBottleneck(Bottleneck): + expansion = 2 + resneXt = True + + +Block = Union[Type[Bottleneck], Type[BasicBlock]] + + +class ResNet(nn.Module): + def __init__( + self, + in_channels: int, + base_planes: int, + ngroups: int, + block: Block, + layers: List[int], + cardinality: int = 1, + ) -> None: + super(ResNet, self).__init__() + self.conv1 = nn.Sequential( + nn.Conv2d( + in_channels, + base_planes, + kernel_size=7, + stride=2, + padding=3, + bias=False, + ), + nn.GroupNorm(ngroups, base_planes), + nn.ReLU(True), + ) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.cardinality = cardinality + + self.inplanes = base_planes + if block.resneXt: + base_planes *= 2 + + self.layer1 = self._make_layer(block, ngroups, base_planes, layers[0]) + self.layer2 = self._make_layer( + block, ngroups, base_planes * 2, layers[1], stride=2 + ) + self.layer3 = self._make_layer( + block, ngroups, base_planes * 2 * 2, layers[2], stride=2 + ) + self.layer4 = self._make_layer( + block, ngroups, base_planes * 2 * 2 * 2, layers[3], stride=2 + ) + + self.final_channels = self.inplanes + self.final_spatial_compress = 1.0 / (2 ** 5) + + def _make_layer( + self, + block: Block, + ngroups: int, + planes: int, + blocks: int, + stride: int = 1, + ) -> Sequential: + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + nn.GroupNorm(ngroups, planes * block.expansion), + ) + + layers = [] + layers.append( + block( + self.inplanes, + planes, + ngroups, + stride, + downsample, + cardinality=self.cardinality, + ) + ) + self.inplanes = planes * block.expansion + for _i in range(1, blocks): + layers.append(block(self.inplanes, planes, ngroups)) + + return nn.Sequential(*layers) + + def forward(self, x) -> Tensor: + x = self.conv1(x) + x = self.maxpool(x) + x = cast(Tensor, x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + return x + + +def resnet18(in_channels, base_planes, ngroups): + model = ResNet(in_channels, base_planes, ngroups, BasicBlock, [2, 2, 2, 2]) + + return model + + +def resnet50(in_channels: int, base_planes: int, 
ngroups: int) -> ResNet: + model = ResNet(in_channels, base_planes, ngroups, Bottleneck, [3, 4, 6, 3]) + + return model + + +def resneXt50(in_channels, base_planes, ngroups): + model = ResNet( + in_channels, + base_planes, + ngroups, + ResNeXtBottleneck, + [3, 4, 6, 3], + cardinality=int(base_planes / 2), + ) + + return model + + +def se_resnet50(in_channels, base_planes, ngroups): + model = ResNet( + in_channels, base_planes, ngroups, SEBottleneck, [3, 4, 6, 3] + ) + + return model + + +def se_resneXt50(in_channels, base_planes, ngroups): + model = ResNet( + in_channels, + base_planes, + ngroups, + SEResNeXtBottleneck, + [3, 4, 6, 3], + cardinality=int(base_planes / 2), + ) + + return model + + +def se_resneXt101(in_channels, base_planes, ngroups): + model = ResNet( + in_channels, + base_planes, + ngroups, + SEResNeXtBottleneck, + [3, 4, 23, 3], + cardinality=int(base_planes / 2), + ) + + return model diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/policy/resnet_policy.py b/habitat-lab-dialog/habitat_baselines/rl/ddppo/policy/resnet_policy.py new file mode 100644 index 0000000..91465c7 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/policy/resnet_policy.py @@ -0,0 +1,449 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Dict, Tuple + +import numpy as np +import torch +from gym import spaces +from torch import nn as nn +from torch.nn import functional as F + +from habitat.config import Config +from habitat.tasks.nav.nav import ( + EpisodicCompassSensor, + EpisodicGPSSensor, + HeadingSensor, + ImageGoalSensor, + IntegratedPointGoalGPSAndCompassSensor, + PointGoalSensor, + ProximitySensor, +) +from habitat.tasks.nav.object_nav_task import ObjectGoalSensor +from habitat_baselines.common.baseline_registry import baseline_registry +from habitat_baselines.rl.ddppo.policy import resnet +from habitat_baselines.rl.ddppo.policy.running_mean_and_var import ( + RunningMeanAndVar, +) +from habitat_baselines.rl.models.rnn_state_encoder import ( + build_rnn_state_encoder, +) +from habitat_baselines.rl.ppo import Net, Policy + + +@baseline_registry.register_policy +class PointNavResNetPolicy(Policy): + def __init__( + self, + observation_space: spaces.Dict, + action_space, + hidden_size: int = 512, + num_recurrent_layers: int = 1, + rnn_type: str = "GRU", + resnet_baseplanes: int = 32, + backbone: str = "resnet18", + normalize_visual_inputs: bool = False, + force_blind_policy: bool = False, + **kwargs + ): + super().__init__( + PointNavResNetNet( + observation_space=observation_space, + action_space=action_space, + hidden_size=hidden_size, + num_recurrent_layers=num_recurrent_layers, + rnn_type=rnn_type, + backbone=backbone, + resnet_baseplanes=resnet_baseplanes, + normalize_visual_inputs=normalize_visual_inputs, + force_blind_policy=force_blind_policy, + ), + action_space.n, + ) + + @classmethod + def from_config( + cls, config: Config, observation_space: spaces.Dict, action_space + ): + return cls( + observation_space=observation_space, + action_space=action_space, + hidden_size=config.RL.PPO.hidden_size, + rnn_type=config.RL.DDPPO.rnn_type, + num_recurrent_layers=config.RL.DDPPO.num_recurrent_layers, + backbone=config.RL.DDPPO.backbone, + normalize_visual_inputs="rgb" in observation_space.spaces, + force_blind_policy=config.FORCE_BLIND_POLICY, + ) + + +class ResNetEncoder(nn.Module): + def 
__init__( + self, + observation_space: spaces.Dict, + baseplanes: int = 32, + ngroups: int = 32, + spatial_size: int = 128, + make_backbone=None, + normalize_visual_inputs: bool = False, + ): + super().__init__() + + if "rgb" in observation_space.spaces: + self._n_input_rgb = observation_space.spaces["rgb"].shape[2] + spatial_size = observation_space.spaces["rgb"].shape[0] // 2 + else: + self._n_input_rgb = 0 + + if "depth" in observation_space.spaces: + self._n_input_depth = observation_space.spaces["depth"].shape[2] + spatial_size = observation_space.spaces["depth"].shape[0] // 2 + else: + self._n_input_depth = 0 + + if normalize_visual_inputs: + self.running_mean_and_var: nn.Module = RunningMeanAndVar( + self._n_input_depth + self._n_input_rgb + ) + else: + self.running_mean_and_var = nn.Sequential() + + if not self.is_blind: + input_channels = self._n_input_depth + self._n_input_rgb + self.backbone = make_backbone(input_channels, baseplanes, ngroups) + + final_spatial = int( + spatial_size * self.backbone.final_spatial_compress + ) + after_compression_flat_size = 2048 + num_compression_channels = int( + round(after_compression_flat_size / (final_spatial ** 2)) + ) + self.compression = nn.Sequential( + nn.Conv2d( + self.backbone.final_channels, + num_compression_channels, + kernel_size=3, + padding=1, + bias=False, + ), + nn.GroupNorm(1, num_compression_channels), + nn.ReLU(True), + ) + + self.output_shape = ( + num_compression_channels, + final_spatial, + final_spatial, + ) + + @property + def is_blind(self): + return self._n_input_rgb + self._n_input_depth == 0 + + def layer_init(self): + for layer in self.modules(): + if isinstance(layer, (nn.Conv2d, nn.Linear)): + nn.init.kaiming_normal_( + layer.weight, nn.init.calculate_gain("relu") + ) + if layer.bias is not None: + nn.init.constant_(layer.bias, val=0) + + def forward(self, observations: Dict[str, torch.Tensor]) -> torch.Tensor: # type: ignore + if self.is_blind: + return None + + cnn_input = [] + if self._n_input_rgb > 0: + rgb_observations = observations["rgb"] + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + rgb_observations = rgb_observations.permute(0, 3, 1, 2) + rgb_observations = ( + rgb_observations.float() / 255.0 + ) # normalize RGB + cnn_input.append(rgb_observations) + + if self._n_input_depth > 0: + depth_observations = observations["depth"] + + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + depth_observations = depth_observations.permute(0, 3, 1, 2) + + cnn_input.append(depth_observations) + + x = torch.cat(cnn_input, dim=1) + x = F.avg_pool2d(x, 2) + + x = self.running_mean_and_var(x) + x = self.backbone(x) + x = self.compression(x) + return x + + +class PointNavResNetNet(Net): + """Network which passes the input image through CNN and concatenates + goal vector with CNN's output and passes that through RNN. 
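+
+    Each goal/pose input (point goal, object goal, GPS, heading, proximity,
+    compass) and the previous action are embedded to small fixed-size vectors
+    that are concatenated with the visual features before the RNN. With the
+    default hidden_size of 512 and a point-goal-with-GPS-and-compass sensor,
+    for example, the RNN input is 512 (visual) + 32 (goal) + 32 (previous
+    action) dimensional.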
+ """ + + def __init__( + self, + observation_space: spaces.Dict, + action_space, + hidden_size: int, + num_recurrent_layers: int, + rnn_type: str, + backbone, + resnet_baseplanes, + normalize_visual_inputs: bool, + force_blind_policy: bool = False, + ): + super().__init__() + + self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32) + self._n_prev_action = 32 + rnn_input_size = self._n_prev_action + + if ( + IntegratedPointGoalGPSAndCompassSensor.cls_uuid + in observation_space.spaces + ): + n_input_goal = ( + observation_space.spaces[ + IntegratedPointGoalGPSAndCompassSensor.cls_uuid + ].shape[0] + + 1 + ) + self.tgt_embeding = nn.Linear(n_input_goal, 32) + rnn_input_size += 32 + + if ObjectGoalSensor.cls_uuid in observation_space.spaces: + self._n_object_categories = ( + int( + observation_space.spaces[ObjectGoalSensor.cls_uuid].high[0] + ) + + 1 + ) + self.obj_categories_embedding = nn.Embedding( + self._n_object_categories, 32 + ) + rnn_input_size += 32 + + if EpisodicGPSSensor.cls_uuid in observation_space.spaces: + input_gps_dim = observation_space.spaces[ + EpisodicGPSSensor.cls_uuid + ].shape[0] + self.gps_embedding = nn.Linear(input_gps_dim, 32) + rnn_input_size += 32 + + if PointGoalSensor.cls_uuid in observation_space.spaces: + input_pointgoal_dim = observation_space.spaces[ + PointGoalSensor.cls_uuid + ].shape[0] + self.pointgoal_embedding = nn.Linear(input_pointgoal_dim, 32) + rnn_input_size += 32 + + if HeadingSensor.cls_uuid in observation_space.spaces: + input_heading_dim = ( + observation_space.spaces[HeadingSensor.cls_uuid].shape[0] + 1 + ) + assert input_heading_dim == 2, "Expected heading with 2D rotation." + self.heading_embedding = nn.Linear(input_heading_dim, 32) + rnn_input_size += 32 + + if ProximitySensor.cls_uuid in observation_space.spaces: + input_proximity_dim = observation_space.spaces[ + ProximitySensor.cls_uuid + ].shape[0] + self.proximity_embedding = nn.Linear(input_proximity_dim, 32) + rnn_input_size += 32 + + if EpisodicCompassSensor.cls_uuid in observation_space.spaces: + assert ( + observation_space.spaces[EpisodicCompassSensor.cls_uuid].shape[ + 0 + ] + == 1 + ), "Expected compass with 2D rotation." 
+ input_compass_dim = 2 # cos and sin of the angle + self.compass_embedding = nn.Linear(input_compass_dim, 32) + rnn_input_size += 32 + + if ImageGoalSensor.cls_uuid in observation_space.spaces: + goal_observation_space = spaces.Dict( + {"rgb": observation_space.spaces[ImageGoalSensor.cls_uuid]} + ) + self.goal_visual_encoder = ResNetEncoder( + goal_observation_space, + baseplanes=resnet_baseplanes, + ngroups=resnet_baseplanes // 2, + make_backbone=getattr(resnet, backbone), + normalize_visual_inputs=normalize_visual_inputs, + ) + + self.goal_visual_fc = nn.Sequential( + nn.Flatten(), + nn.Linear( + np.prod(self.goal_visual_encoder.output_shape), hidden_size + ), + nn.ReLU(True), + ) + + rnn_input_size += hidden_size + + self._hidden_size = hidden_size + + self.visual_encoder = ResNetEncoder( + observation_space if not force_blind_policy else spaces.Dict({}), + baseplanes=resnet_baseplanes, + ngroups=resnet_baseplanes // 2, + make_backbone=getattr(resnet, backbone), + normalize_visual_inputs=normalize_visual_inputs, + ) + + if not self.visual_encoder.is_blind: + self.visual_fc = nn.Sequential( + nn.Flatten(), + nn.Linear( + np.prod(self.visual_encoder.output_shape), hidden_size + ), + nn.ReLU(True), + ) + + self.state_encoder = build_rnn_state_encoder( + (0 if self.is_blind else self._hidden_size) + rnn_input_size, + self._hidden_size, + rnn_type=rnn_type, + num_layers=num_recurrent_layers, + ) + + self.train() + + @property + def output_size(self): + return self._hidden_size + + @property + def is_blind(self): + return self.visual_encoder.is_blind + + @property + def num_recurrent_layers(self): + return self.state_encoder.num_recurrent_layers + + def forward( + self, + observations: Dict[str, torch.Tensor], + rnn_hidden_states, + prev_actions, + masks, + ) -> Tuple[torch.Tensor, torch.Tensor]: + x = [] + if not self.is_blind: + if "visual_features" in observations: + visual_feats = observations["visual_features"] + else: + visual_feats = self.visual_encoder(observations) + + visual_feats = self.visual_fc(visual_feats) + x.append(visual_feats) + + if IntegratedPointGoalGPSAndCompassSensor.cls_uuid in observations: + goal_observations = observations[ + IntegratedPointGoalGPSAndCompassSensor.cls_uuid + ] + if goal_observations.shape[1] == 2: + # Polar Dimensionality 2 + # 2D polar transform + goal_observations = torch.stack( + [ + goal_observations[:, 0], + torch.cos(-goal_observations[:, 1]), + torch.sin(-goal_observations[:, 1]), + ], + -1, + ) + else: + assert ( + goal_observations.shape[1] == 3 + ), "Unsupported dimensionality" + vertical_angle_sin = torch.sin(goal_observations[:, 2]) + # Polar Dimensionality 3 + # 3D Polar transformation + goal_observations = torch.stack( + [ + goal_observations[:, 0], + torch.cos(-goal_observations[:, 1]) + * vertical_angle_sin, + torch.sin(-goal_observations[:, 1]) + * vertical_angle_sin, + torch.cos(goal_observations[:, 2]), + ], + -1, + ) + + x.append(self.tgt_embeding(goal_observations)) + + if PointGoalSensor.cls_uuid in observations: + goal_observations = observations[PointGoalSensor.cls_uuid] + x.append(self.pointgoal_embedding(goal_observations)) + + if ProximitySensor.cls_uuid in observations: + sensor_observations = observations[ProximitySensor.cls_uuid] + x.append(self.proximity_embedding(sensor_observations)) + + if HeadingSensor.cls_uuid in observations: + sensor_observations = observations[HeadingSensor.cls_uuid] + sensor_observations = torch.stack( + [ + torch.cos(sensor_observations[0]), + torch.sin(sensor_observations[0]), + ], + -1, + 
) + x.append(self.heading_embedding(sensor_observations)) + + if ObjectGoalSensor.cls_uuid in observations: + object_goal = observations[ObjectGoalSensor.cls_uuid].long() + x.append(self.obj_categories_embedding(object_goal).squeeze(dim=1)) + + if EpisodicCompassSensor.cls_uuid in observations: + compass_observations = torch.stack( + [ + torch.cos(observations[EpisodicCompassSensor.cls_uuid]), + torch.sin(observations[EpisodicCompassSensor.cls_uuid]), + ], + -1, + ) + x.append( + self.compass_embedding(compass_observations.squeeze(dim=1)) + ) + + if EpisodicGPSSensor.cls_uuid in observations: + x.append( + self.gps_embedding(observations[EpisodicGPSSensor.cls_uuid]) + ) + + if ImageGoalSensor.cls_uuid in observations: + goal_image = observations[ImageGoalSensor.cls_uuid] + goal_output = self.goal_visual_encoder({"rgb": goal_image}) + x.append(self.goal_visual_fc(goal_output)) + + prev_actions = prev_actions.squeeze(-1) + start_token = torch.zeros_like(prev_actions) + prev_actions = self.prev_action_embedding( + torch.where(masks.view(-1), prev_actions + 1, start_token) + ) + + x.append(prev_actions) + + out = torch.cat(x, dim=1) + out, rnn_hidden_states = self.state_encoder( + out, rnn_hidden_states, masks + ) + + return out, rnn_hidden_states diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/policy/running_mean_and_var.py b/habitat-lab-dialog/habitat_baselines/rl/ddppo/policy/running_mean_and_var.py new file mode 100644 index 0000000..45375f8 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/policy/running_mean_and_var.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from torch import Tensor +from torch import distributed as distrib +from torch import nn as nn + + +class RunningMeanAndVar(nn.Module): + def __init__(self, n_channels: int) -> None: + super().__init__() + self.register_buffer("_mean", torch.zeros(1, n_channels, 1, 1)) + self.register_buffer("_var", torch.zeros(1, n_channels, 1, 1)) + self.register_buffer("_count", torch.zeros(())) + self._mean: torch.Tensor = self._mean + self._var: torch.Tensor = self._var + self._count: torch.Tensor = self._count + + def forward(self, x: Tensor) -> Tensor: + if self.training: + n = x.size(0) + # We will need to do reductions (mean) over the channel dimension, + # so moving channels to the first dimension and then flattening + # will make those faster. 
Further, it makes things more numerically stable + # for fp16 since it is done in a single reduction call instead of + # multiple + x_channels_first = ( + x.transpose(1, 0).contiguous().view(x.size(1), -1) + ) + new_mean = x_channels_first.mean(-1, keepdim=True) + new_count = torch.full_like(self._count, n) + + if distrib.is_initialized(): + distrib.all_reduce(new_mean) + distrib.all_reduce(new_count) + new_mean /= distrib.get_world_size() + + new_var = ( + (x_channels_first - new_mean).pow(2).mean(dim=-1, keepdim=True) + ) + + if distrib.is_initialized(): + distrib.all_reduce(new_var) + new_var /= distrib.get_world_size() + + new_mean = new_mean.view(1, -1, 1, 1) + new_var = new_var.view(1, -1, 1, 1) + + m_a = self._var * (self._count) + m_b = new_var * (new_count) + M2 = ( + m_a + + m_b + + (new_mean - self._mean).pow(2) + * self._count + * new_count + / (self._count + new_count) + ) + + self._var = M2 / (self._count + new_count) + self._mean = (self._count * self._mean + new_count * new_mean) / ( + self._count + new_count + ) + + self._count += new_count + + inv_stdev = torch.rsqrt( + torch.max(self._var, torch.full_like(self._var, 1e-2)) + ) + # This is the same as + # (x - self._mean) * inv_stdev but is faster since it can + # make use of addcmul and is more numerically stable in fp16 + return torch.addcmul(-self._mean * inv_stdev, x, inv_stdev) diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/requirements.txt b/habitat-lab-dialog/habitat_baselines/rl/ddppo/requirements.txt new file mode 100644 index 0000000..0c9b585 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/requirements.txt @@ -0,0 +1 @@ +ifcfg diff --git a/habitat-lab-dialog/habitat_baselines/rl/ddppo/single_node.sh b/habitat-lab-dialog/habitat_baselines/rl/ddppo/single_node.sh new file mode 100644 index 0000000..67eb874 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ddppo/single_node.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +export GLOG_minloglevel=2 +export MAGNUM_LOG=quiet + +set -x +python -u -m torch.distributed.launch \ + --use_env \ + --nproc_per_node 1 \ + habitat_baselines/run.py \ + --exp-config habitat_baselines/config/pointnav/ddppo_pointnav.yaml \ + --run-type train diff --git a/habitat-lab-dialog/habitat_baselines/rl/models/__init__.py b/habitat-lab-dialog/habitat_baselines/rl/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/habitat-lab-dialog/habitat_baselines/rl/models/rnn_state_encoder.py b/habitat-lab-dialog/habitat_baselines/rl/models/rnn_state_encoder.py new file mode 100644 index 0000000..681a3bf --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/models/rnn_state_encoder.py @@ -0,0 +1,421 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Tuple + +import torch +import torch.nn as nn +from torch.nn.utils.rnn import PackedSequence + + +def _invert_permutation(permutation: torch.Tensor) -> torch.Tensor: + output = torch.empty_like(permutation) + output.scatter_( + 0, + permutation, + torch.arange(0, permutation.numel(), device=permutation.device), + ) + return output + + +def _build_pack_info_from_dones( + dones: torch.Tensor, + T: int, +) -> Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor +]: + r"""Create the indexing info needed to make the PackedSequence + based on the dones. 
+ + PackedSequences are PyTorch's way of supporting a single RNN forward + call where each input in the batch can have an arbitrary sequence length. + + They work as follows: Given the sequences [c], [x, y, z], [a, b], + we generate data [x, a, c, y, b, z] and batch_sizes [3, 2, 1]. The + data is a flattened-out version of the input sequences (the ordering in + data is determined by sequence length). batch_sizes tells you, + for each index, how many sequences have a length of (index + 1) or greater. + + This method will generate the new index ordering such that you can + construct the data for a PackedSequence from a (T*N, ...) tensor + via x.index_select(0, select_inds). + """ + dones = dones.view(T, -1) + N = dones.size(1) + + rollout_boundaries = dones.clone().detach() + # Force a rollout boundary for t=0. We will use the + # original dones for masking later, so this is fine + # and simplifies the logic considerably + rollout_boundaries[0] = True + rollout_boundaries = rollout_boundaries.nonzero(as_tuple=False) + + # The rollout_boundaries[:, 0]*N will make the episode_starts index into + # the T*N flattened tensors + episode_starts = rollout_boundaries[:, 0] * N + rollout_boundaries[:, 1] + + # We need to create a transposed start indexing so we can compute episode lengths: + # if we make the starts index into an N*T tensor, then starts[1] - starts[0] + # will compute the length of the 0th episode + episode_starts_transposed = ( + rollout_boundaries[:, 1] * T + rollout_boundaries[:, 0] + ) + # Need to sort so the above logic is correct + episode_starts_transposed, sorted_indices = torch.sort( + episode_starts_transposed, descending=False + ) + + # Calculate length of episode rollouts + rollout_lengths = ( + episode_starts_transposed[1:] - episode_starts_transposed[:-1] + ) + last_len = N * T - episode_starts_transposed[-1] + rollout_lengths = torch.cat([rollout_lengths, last_len.unsqueeze(0)]) + # Undo the sort above + rollout_lengths = rollout_lengths.index_select( + 0, _invert_permutation(sorted_indices) + ) + + # Resort in descending order of episode length + lengths, sorted_indices = torch.sort(rollout_lengths, descending=True) + + # We will want these on the CPU for torch.unique_consecutive, + # so move now.
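+    # As a concrete example: with N = 2 environments, T = 3 steps and no
+    # episode endings inside the rollout, the forced boundary at t=0 gives
+    # lengths == [3, 3], so batch_sizes below becomes [2, 2, 2] and
+    # select_inds is simply [0, 1, 2, 3, 4, 5].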
+ cpu_lengths = lengths.to(device="cpu", non_blocking=True) + + episode_starts = episode_starts.index_select(0, sorted_indices) + select_inds = torch.empty((T * N), device=dones.device, dtype=torch.int64) + + max_length = int(cpu_lengths[0].item()) + # batch_sizes is *always* on the CPU + batch_sizes = torch.empty((max_length,), device="cpu", dtype=torch.long) + + offset = 0 + prev_len = 0 + num_valid_for_length = lengths.size(0) + + unique_lengths = torch.unique_consecutive(cpu_lengths) + # Iterate over all unique lengths in reverse as they are sorted + # in decreasing order + for next_len in reversed(unique_lengths): + valids = lengths[0:num_valid_for_length] > prev_len + num_valid_for_length = int(valids.float().sum()) + + batch_sizes[prev_len:next_len] = num_valid_for_length + + # Creates this array + # [step * N + start for step in range(prev_len, next_len) + # for start in episode_starts[0:num_valid_for_length]] + # * N because each step is separated by N elements + new_inds = ( + torch.arange( + prev_len, next_len, device=episode_starts.device + ).view(next_len - prev_len, 1) + * N + + episode_starts[0:num_valid_for_length].view( + 1, num_valid_for_length + ) + ).view(-1) + + select_inds[offset : offset + new_inds.numel()] = new_inds + + offset += new_inds.numel() + + prev_len = next_len + + # Make sure we have an index for all elements + assert offset == T * N + + # This is used in conjunction with episode_starts to get + # the RNN hidden states + rnn_state_batch_inds = episode_starts % N + # This indicates that a given episode is the last one + # in that rollout. In other words, there are N places + # where this is True, and for each n, True indicates + # that this episode is the last contiguous block of experience. + # This is needed for getting the correct hidden states after + # the RNN forward pass + last_episode_in_batch_mask = ( + (episode_starts + (lengths - 1) * N) // N + ) == (T - 1) + + return ( + select_inds, + batch_sizes, + episode_starts, + rnn_state_batch_inds, + last_episode_in_batch_mask, + ) + + +def build_rnn_inputs( + x: torch.Tensor, not_dones: torch.Tensor, rnn_states: torch.Tensor +) -> Tuple[ + PackedSequence, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor +]: + r"""Create a PackedSequence input for an RNN such that each + set of steps that are part of the same episode are all part of + a batch in the PackedSequence. + + Use the returned select_inds and build_rnn_out_from_seq to invert this. + + :param x: A (T * N, -1) tensor of the data to build the PackedSequence out of + :param not_dones: A (T * N) tensor where not_dones[i] == False indicates an episode is done + :param rnn_states: A (-1, N, -1) tensor of the rnn_hidden_states + + :return: tuple(x_seq, rnn_states, select_inds, rnn_state_batch_inds, last_episode_in_batch_mask) + WHERE + x_seq is the PackedSequence version of x to pass to the RNN + + rnn_states are the corresponding rnn states + + select_inds can be passed to build_rnn_out_from_seq to retrieve the + RNN output + + rnn_state_batch_inds indicates which of the rollouts in the batch a hidden + state came from/is for + + last_episode_in_batch_mask indicates if an episode is the last in that batch.
+ There will be exactly N places where this is True + + """ + + N = rnn_states.size(1) + T = x.size(0) // N + dones = torch.logical_not(not_dones) + + ( + select_inds, + batch_sizes, + episode_starts, + rnn_state_batch_inds, + last_episode_in_batch_mask, + ) = _build_pack_info_from_dones(dones.detach().to(device="cpu"), T) + + select_inds = select_inds.to(device=x.device) + episode_starts = episode_starts.to(device=x.device) + rnn_state_batch_inds = rnn_state_batch_inds.to(device=x.device) + last_episode_in_batch_mask = last_episode_in_batch_mask.to(device=x.device) + + x_seq = PackedSequence( + x.index_select(0, select_inds), batch_sizes, None, None + ) + + # Just select the rnn_states by batch index; the masking below will set things + # to zero in the correct locations + rnn_states = rnn_states.index_select(1, rnn_state_batch_inds) + # Now zero things out in the correct locations + rnn_states = torch.where( + not_dones.view(1, -1, 1).index_select(1, episode_starts), + rnn_states, + rnn_states.new_zeros(()), + ) + + return ( + x_seq, + rnn_states, + select_inds, + rnn_state_batch_inds, + last_episode_in_batch_mask, + ) + + +def build_rnn_out_from_seq( + x_seq: PackedSequence, + hidden_states, + select_inds, + rnn_state_batch_inds, + last_episode_in_batch_mask, + N: int, +) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Construct the output of the rnn from a packed sequence returned by + forward-propagating an RNN on the packed sequence returned by :ref:`build_rnn_inputs`. + + :param x_seq: The packed sequence output from the rnn + :param hidden_states: The hidden states output from the rnn + :param select_inds: Returned from :ref:`build_rnn_inputs` + :param rnn_state_batch_inds: Returned from :ref:`build_rnn_inputs` + :param last_episode_in_batch_mask: Returned from :ref:`build_rnn_inputs` + :param N: The number of simulator instances in the batch of experience. + """ + x = x_seq.data.index_select(0, _invert_permutation(select_inds)) + + last_hidden_states = torch.masked_select( + hidden_states, + last_episode_in_batch_mask.view(1, hidden_states.size(1), 1), + ).view(hidden_states.size(0), N, hidden_states.size(2)) + output_hidden_states = torch.empty_like(last_hidden_states) + scatter_inds = ( + torch.masked_select(rnn_state_batch_inds, last_episode_in_batch_mask) + .view(1, N, 1) + .expand_as(output_hidden_states) + ) + output_hidden_states.scatter_(1, scatter_inds, last_hidden_states) + + return x, output_hidden_states + + +class RNNStateEncoder(nn.Module): + r"""RNN encoder for use with RL and possibly IL. + + The main functionality this provides over just using PyTorch's RNN interface directly + is that it takes an additional masks input that resets the hidden state between two adjacent + timesteps to handle episodes ending in the middle of a rollout.
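+
+    Masks are expected to be False (0) at the first step of a new episode and
+    True (1) otherwise; a False mask zeroes the incoming hidden state so no
+    information leaks across an episode boundary.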
+ """ + + def layer_init(self): + for name, param in self.rnn.named_parameters(): + if "weight" in name: + nn.init.orthogonal_(param) + elif "bias" in name: + nn.init.constant_(param, 0) + + def pack_hidden(self, hidden_states: torch.Tensor) -> torch.Tensor: + return hidden_states + + def unpack_hidden(self, hidden_states: torch.Tensor) -> torch.Tensor: + return hidden_states + + def single_forward( + self, x, hidden_states, masks + ) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Forward for a non-sequence input""" + + hidden_states = torch.where( + masks.view(1, -1, 1), hidden_states, hidden_states.new_zeros(()) + ) + + x, hidden_states = self.rnn( + x.unsqueeze(0), self.unpack_hidden(hidden_states) + ) + hidden_states = self.pack_hidden(hidden_states) + + x = x.squeeze(0) + return x, hidden_states + + def seq_forward( + self, x, hidden_states, masks + ) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Forward for a sequence of length T + + Args: + x: (T, N, -1) Tensor that has been flattened to (T * N, -1) + hidden_states: The starting hidden state. + masks: The masks to be applied to hidden state at every timestep. + A (T, N) tensor flatten to (T * N) + """ + N = hidden_states.size(1) + + ( + x_seq, + hidden_states, + select_inds, + rnn_state_batch_inds, + last_episode_in_batch_mask, + ) = build_rnn_inputs(x, masks, hidden_states) + + x_seq, hidden_states = self.rnn( + x_seq, self.unpack_hidden(hidden_states) + ) + hidden_states = self.pack_hidden(hidden_states) + + x, hidden_states = build_rnn_out_from_seq( + x_seq, + hidden_states, + select_inds, + rnn_state_batch_inds, + last_episode_in_batch_mask, + N, + ) + + return x, hidden_states + + def forward( + self, x, hidden_states, masks + ) -> Tuple[torch.Tensor, torch.Tensor]: + hidden_states = hidden_states.permute(1, 0, 2) + if x.size(0) == hidden_states.size(1): + x, hidden_states = self.single_forward(x, hidden_states, masks) + else: + x, hidden_states = self.seq_forward(x, hidden_states, masks) + + hidden_states = hidden_states.permute(1, 0, 2) + + return x, hidden_states + + +class LSTMStateEncoder(RNNStateEncoder): + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int = 1, + ): + super().__init__() + + self.num_recurrent_layers = num_layers * 2 + + self.rnn = nn.LSTM( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + ) + + self.layer_init() + + def pack_hidden( + self, hidden_states: Tuple[torch.Tensor, torch.Tensor] + ) -> torch.Tensor: + return torch.cat(hidden_states, 0) + + def unpack_hidden( + self, hidden_states + ) -> Tuple[torch.Tensor, torch.Tensor]: + lstm_states = torch.chunk(hidden_states, 2, 0) + return (lstm_states[0], lstm_states[1]) + + +class GRUStateEncoder(RNNStateEncoder): + def __init__( + self, + input_size: int, + hidden_size: int, + num_layers: int = 1, + ): + super().__init__() + + self.num_recurrent_layers = num_layers + + self.rnn = nn.GRU( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + ) + + self.layer_init() + + +def build_rnn_state_encoder( + input_size: int, + hidden_size: int, + rnn_type: str = "GRU", + num_layers: int = 1, +): + r"""Factory for :ref:`RNNStateEncoder`. Returns one with either a GRU or LSTM based on + the specified RNN type. + + :param input_size: The input size of the RNN + :param hidden_size: The hidden dimension of the RNN + :param rnn_types: The type of the RNN cell. Can either be GRU or LSTM + :param num_layers: The number of RNN layers. 
+ """ + rnn_type = rnn_type.lower() + if rnn_type == "gru": + return GRUStateEncoder(input_size, hidden_size, num_layers) + elif rnn_type == "lstm": + return LSTMStateEncoder(input_size, hidden_size, num_layers) + else: + raise RuntimeError(f"Did not recognize rnn type '{rnn_type}'") diff --git a/habitat-lab-dialog/habitat_baselines/rl/models/simple_cnn.py b/habitat-lab-dialog/habitat_baselines/rl/models/simple_cnn.py new file mode 100644 index 0000000..dcfeec1 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/models/simple_cnn.py @@ -0,0 +1,154 @@ +from typing import Dict + +import numpy as np +import torch +from torch import nn as nn + + +class SimpleCNN(nn.Module): + r"""A Simple 3-Conv CNN followed by a fully connected layer + + Takes in observations and produces an embedding of the rgb and/or depth components + + Args: + observation_space: The observation_space of the agent + output_size: The size of the embedding vector + """ + + def __init__( + self, + observation_space, + output_size, + ): + super().__init__() + + if "rgb" in observation_space.spaces: + self._n_input_rgb = observation_space.spaces["rgb"].shape[2] + else: + self._n_input_rgb = 0 + + if "depth" in observation_space.spaces: + self._n_input_depth = observation_space.spaces["depth"].shape[2] + else: + self._n_input_depth = 0 + + # kernel size for different CNN layers + self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)] + + # strides for different CNN layers + self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)] + + if self._n_input_rgb > 0: + cnn_dims = np.array( + observation_space.spaces["rgb"].shape[:2], dtype=np.float32 + ) + elif self._n_input_depth > 0: + cnn_dims = np.array( + observation_space.spaces["depth"].shape[:2], dtype=np.float32 + ) + + if self.is_blind: + self.cnn = nn.Sequential() + else: + for kernel_size, stride in zip( + self._cnn_layers_kernel_size, self._cnn_layers_stride + ): + cnn_dims = self._conv_output_dim( + dimension=cnn_dims, + padding=np.array([0, 0], dtype=np.float32), + dilation=np.array([1, 1], dtype=np.float32), + kernel_size=np.array(kernel_size, dtype=np.float32), + stride=np.array(stride, dtype=np.float32), + ) + + self.cnn = nn.Sequential( + nn.Conv2d( + in_channels=self._n_input_rgb + self._n_input_depth, + out_channels=32, + kernel_size=self._cnn_layers_kernel_size[0], + stride=self._cnn_layers_stride[0], + ), + nn.ReLU(True), + nn.Conv2d( + in_channels=32, + out_channels=64, + kernel_size=self._cnn_layers_kernel_size[1], + stride=self._cnn_layers_stride[1], + ), + nn.ReLU(True), + nn.Conv2d( + in_channels=64, + out_channels=32, + kernel_size=self._cnn_layers_kernel_size[2], + stride=self._cnn_layers_stride[2], + ), + # nn.ReLU(True), + nn.Flatten(), + nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size), + nn.ReLU(True), + ) + + self.layer_init() + + def _conv_output_dim( + self, dimension, padding, dilation, kernel_size, stride + ): + r"""Calculates the output height and width based on the input + height and width to the convolution layer. 
+ + ref: https://pytorch.org/docs/master/nn.html#torch.nn.Conv2d + """ + assert len(dimension) == 2 + out_dimension = [] + for i in range(len(dimension)): + out_dimension.append( + int( + np.floor( + ( + ( + dimension[i] + + 2 * padding[i] + - dilation[i] * (kernel_size[i] - 1) + - 1 + ) + / stride[i] + ) + + 1 + ) + ) + ) + return tuple(out_dimension) + + def layer_init(self): + for layer in self.cnn: # type: ignore + if isinstance(layer, (nn.Conv2d, nn.Linear)): + nn.init.kaiming_normal_( + layer.weight, nn.init.calculate_gain("relu") + ) + if layer.bias is not None: + nn.init.constant_(layer.bias, val=0) + + @property + def is_blind(self): + return self._n_input_rgb + self._n_input_depth == 0 + + def forward(self, observations: Dict[str, torch.Tensor]): + cnn_input = [] + if self._n_input_rgb > 0: + rgb_observations = observations["rgb"] + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + rgb_observations = rgb_observations.permute(0, 3, 1, 2) + rgb_observations = ( + rgb_observations.float() / 255.0 + ) # normalize RGB + cnn_input.append(rgb_observations) + + if self._n_input_depth > 0: + depth_observations = observations["depth"] + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + depth_observations = depth_observations.permute(0, 3, 1, 2) + cnn_input.append(depth_observations) + + cnn_inputs = torch.cat(cnn_input, dim=1) + + return self.cnn(cnn_inputs) diff --git a/habitat-lab-dialog/habitat_baselines/rl/ppo/__init__.py b/habitat-lab-dialog/habitat_baselines/rl/ppo/__init__.py new file mode 100644 index 0000000..d4fbfef --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ppo/__init__.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat_baselines.rl.ppo.policy import Net, PointNavBaselinePolicy, Policy +from habitat_baselines.rl.ppo.ppo import PPO +from habitat_baselines.rl.ppo.ppo_trainer import RolloutStorage + +__all__ = ["PPO", "Policy", "RolloutStorage", "Net", "PointNavBaselinePolicy"] diff --git a/habitat-lab-dialog/habitat_baselines/rl/ppo/policy.py b/habitat-lab-dialog/habitat_baselines/rl/ppo/policy.py new file mode 100644 index 0000000..f3e4588 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ppo/policy.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
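+
+# A Policy couples a Net backbone with a categorical action distribution and
+# a critic head; concrete policies (e.g. PointNavBaselinePolicy below) only
+# need to supply the Net and a from_config constructor.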
+import abc + +import torch +from gym import spaces +from torch import nn as nn + +from habitat.config import Config +from habitat.tasks.nav.nav import ( + ImageGoalSensor, + IntegratedPointGoalGPSAndCompassSensor, + PointGoalSensor, +) +from habitat_baselines.common.baseline_registry import baseline_registry +from habitat_baselines.rl.models.rnn_state_encoder import ( + build_rnn_state_encoder, +) +from habitat_baselines.rl.models.simple_cnn import SimpleCNN +from habitat_baselines.utils.common import CategoricalNet + + +class Policy(nn.Module, metaclass=abc.ABCMeta): + def __init__(self, net, dim_actions): + super().__init__() + self.net = net + self.dim_actions = dim_actions + + self.action_distribution = CategoricalNet( + self.net.output_size, self.dim_actions + ) + self.critic = CriticHead(self.net.output_size) + + def forward(self, *x): + raise NotImplementedError + + def act( + self, + observations, + rnn_hidden_states, + prev_actions, + masks, + deterministic=False, + ): + features, rnn_hidden_states = self.net( + observations, rnn_hidden_states, prev_actions, masks + ) + distribution = self.action_distribution(features) + value = self.critic(features) + + if deterministic: + action = distribution.mode() + else: + action = distribution.sample() + + action_log_probs = distribution.log_probs(action) + + return value, action, action_log_probs, rnn_hidden_states + + def get_value(self, observations, rnn_hidden_states, prev_actions, masks): + features, _ = self.net( + observations, rnn_hidden_states, prev_actions, masks + ) + return self.critic(features) + + def evaluate_actions( + self, observations, rnn_hidden_states, prev_actions, masks, action + ): + features, rnn_hidden_states = self.net( + observations, rnn_hidden_states, prev_actions, masks + ) + distribution = self.action_distribution(features) + value = self.critic(features) + + action_log_probs = distribution.log_probs(action) + distribution_entropy = distribution.entropy() + + return value, action_log_probs, distribution_entropy, rnn_hidden_states + + @classmethod + @abc.abstractmethod + def from_config(cls, config, observation_space, action_space): + pass + + +class CriticHead(nn.Module): + def __init__(self, input_size): + super().__init__() + self.fc = nn.Linear(input_size, 1) + nn.init.orthogonal_(self.fc.weight) + nn.init.constant_(self.fc.bias, 0) + + def forward(self, x): + return self.fc(x) + + +@baseline_registry.register_policy +class PointNavBaselinePolicy(Policy): + def __init__( + self, + observation_space: spaces.Dict, + action_space, + hidden_size: int = 512, + **kwargs + ): + super().__init__( + PointNavBaselineNet( # type: ignore + observation_space=observation_space, + hidden_size=hidden_size, + **kwargs, + ), + action_space.n, + ) + + @classmethod + def from_config( + cls, config: Config, observation_space: spaces.Dict, action_space + ): + return cls( + observation_space=observation_space, + action_space=action_space, + hidden_size=config.RL.PPO.hidden_size, + ) + + +class Net(nn.Module, metaclass=abc.ABCMeta): + @abc.abstractmethod + def forward(self, observations, rnn_hidden_states, prev_actions, masks): + pass + + @property + @abc.abstractmethod + def output_size(self): + pass + + @property + @abc.abstractmethod + def num_recurrent_layers(self): + pass + + @property + @abc.abstractmethod + def is_blind(self): + pass + + +class PointNavBaselineNet(Net): + r"""Network which passes the input image through CNN and concatenates + goal vector with CNN's output and passes that through RNN. 
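+
+    The goal encoding is the raw (point-)goal vector when a point-goal sensor
+    is present, or the output of a separate SimpleCNN encoder when only an
+    image goal is available.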
+ """ + + def __init__( + self, + observation_space: spaces.Dict, + hidden_size: int, + ): + super().__init__() + + if ( + IntegratedPointGoalGPSAndCompassSensor.cls_uuid + in observation_space.spaces + ): + self._n_input_goal = observation_space.spaces[ + IntegratedPointGoalGPSAndCompassSensor.cls_uuid + ].shape[0] + elif PointGoalSensor.cls_uuid in observation_space.spaces: + self._n_input_goal = observation_space.spaces[ + PointGoalSensor.cls_uuid + ].shape[0] + elif ImageGoalSensor.cls_uuid in observation_space.spaces: + goal_observation_space = spaces.Dict( + {"rgb": observation_space.spaces[ImageGoalSensor.cls_uuid]} + ) + self.goal_visual_encoder = SimpleCNN( + goal_observation_space, hidden_size + ) + self._n_input_goal = hidden_size + + self._hidden_size = hidden_size + + self.visual_encoder = SimpleCNN(observation_space, hidden_size) + + self.state_encoder = build_rnn_state_encoder( + (0 if self.is_blind else self._hidden_size) + self._n_input_goal, + self._hidden_size, + ) + + self.train() + + @property + def output_size(self): + return self._hidden_size + + @property + def is_blind(self): + return self.visual_encoder.is_blind + + @property + def num_recurrent_layers(self): + return self.state_encoder.num_recurrent_layers + + def forward(self, observations, rnn_hidden_states, prev_actions, masks): + if IntegratedPointGoalGPSAndCompassSensor.cls_uuid in observations: + target_encoding = observations[ + IntegratedPointGoalGPSAndCompassSensor.cls_uuid + ] + + elif PointGoalSensor.cls_uuid in observations: + target_encoding = observations[PointGoalSensor.cls_uuid] + elif ImageGoalSensor.cls_uuid in observations: + image_goal = observations[ImageGoalSensor.cls_uuid] + target_encoding = self.goal_visual_encoder({"rgb": image_goal}) + + x = [target_encoding] + + if not self.is_blind: + perception_embed = self.visual_encoder(observations) + x = [perception_embed] + x + + x_out = torch.cat(x, dim=1) + x_out, rnn_hidden_states = self.state_encoder( + x_out, rnn_hidden_states, masks + ) + + return x_out, rnn_hidden_states diff --git a/habitat-lab-dialog/habitat_baselines/rl/ppo/ppo.py b/habitat-lab-dialog/habitat_baselines/rl/ppo/ppo.py new file mode 100644 index 0000000..9865efd --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ppo/ppo.py @@ -0,0 +1,177 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Optional, Tuple + +import torch +from torch import Tensor +from torch import nn as nn +from torch import optim as optim + +from habitat.utils import profiling_wrapper +from habitat_baselines.common.rollout_storage import RolloutStorage +from habitat_baselines.rl.ppo.policy import Policy + +EPS_PPO = 1e-5 + + +class PPO(nn.Module): + def __init__( + self, + actor_critic: Policy, + clip_param: float, + ppo_epoch: int, + num_mini_batch: int, + value_loss_coef: float, + entropy_coef: float, + lr: Optional[float] = None, + eps: Optional[float] = None, + max_grad_norm: Optional[float] = None, + use_clipped_value_loss: bool = True, + use_normalized_advantage: bool = True, + ) -> None: + + super().__init__() + + self.actor_critic = actor_critic + + self.clip_param = clip_param + self.ppo_epoch = ppo_epoch + self.num_mini_batch = num_mini_batch + + self.value_loss_coef = value_loss_coef + self.entropy_coef = entropy_coef + + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + self.optimizer = optim.Adam( + list(filter(lambda p: p.requires_grad, actor_critic.parameters())), + lr=lr, + eps=eps, + ) + self.device = next(actor_critic.parameters()).device + self.use_normalized_advantage = use_normalized_advantage + + def forward(self, *x): + raise NotImplementedError + + def get_advantages(self, rollouts: RolloutStorage) -> Tensor: + advantages = ( + rollouts.buffers["returns"][:-1] + - rollouts.buffers["value_preds"][:-1] + ) + if not self.use_normalized_advantage: + return advantages + + return (advantages - advantages.mean()) / (advantages.std() + EPS_PPO) + + def update(self, rollouts: RolloutStorage) -> Tuple[float, float, float]: + advantages = self.get_advantages(rollouts) + + value_loss_epoch = 0.0 + action_loss_epoch = 0.0 + dist_entropy_epoch = 0.0 + + for _e in range(self.ppo_epoch): + profiling_wrapper.range_push("PPO.update epoch") + data_generator = rollouts.recurrent_generator( + advantages, self.num_mini_batch + ) + + for batch in data_generator: + ( + values, + action_log_probs, + dist_entropy, + _, + ) = self._evaluate_actions( + batch["observations"], + batch["recurrent_hidden_states"], + batch["prev_actions"], + batch["masks"], + batch["actions"], + ) + + ratio = torch.exp(action_log_probs - batch["action_log_probs"]) + surr1 = ratio * batch["advantages"] + surr2 = ( + torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + * batch["advantages"] + ) + action_loss = -(torch.min(surr1, surr2).mean()) + + if self.use_clipped_value_loss: + value_pred_clipped = batch["value_preds"] + ( + values - batch["value_preds"] + ).clamp(-self.clip_param, self.clip_param) + value_losses = (values - batch["returns"]).pow(2) + value_losses_clipped = ( + value_pred_clipped - batch["returns"] + ).pow(2) + value_loss = 0.5 * torch.max( + value_losses, value_losses_clipped + ) + else: + value_loss = 0.5 * (batch["returns"] - values).pow(2) + + value_loss = value_loss.mean() + dist_entropy = dist_entropy.mean() + + self.optimizer.zero_grad() + total_loss = ( + value_loss * self.value_loss_coef + + action_loss + - dist_entropy * self.entropy_coef + ) + + self.before_backward(total_loss) + total_loss.backward() + self.after_backward(total_loss) + + self.before_step() + self.optimizer.step() + self.after_step() + + value_loss_epoch += value_loss.item() + action_loss_epoch += action_loss.item() + dist_entropy_epoch += dist_entropy.item() + + profiling_wrapper.range_pop() # PPO.update epoch + + num_updates = self.ppo_epoch * 
self.num_mini_batch + + value_loss_epoch /= num_updates + action_loss_epoch /= num_updates + dist_entropy_epoch /= num_updates + + return value_loss_epoch, action_loss_epoch, dist_entropy_epoch + + def _evaluate_actions( + self, observations, rnn_hidden_states, prev_actions, masks, action + ): + r"""Internal method that calls Policy.evaluate_actions. This is used instead of calling + that directly so that the call can be overridden via inheritance. + """ + return self.actor_critic.evaluate_actions( + observations, rnn_hidden_states, prev_actions, masks, action + ) + + def before_backward(self, loss: Tensor) -> None: + pass + + def after_backward(self, loss: Tensor) -> None: + pass + + def before_step(self) -> None: + nn.utils.clip_grad_norm_( + self.actor_critic.parameters(), self.max_grad_norm + ) + + def after_step(self) -> None: + pass diff --git a/habitat-lab-dialog/habitat_baselines/rl/ppo/ppo_trainer.py b/habitat-lab-dialog/habitat_baselines/rl/ppo/ppo_trainer.py new file mode 100644 index 0000000..a515862 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/ppo/ppo_trainer.py @@ -0,0 +1,1062 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. + # This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import contextlib +import os +import random +import time +from collections import defaultdict, deque +from typing import Any, Dict, List, Optional + +import numpy as np +import torch +import tqdm +from gym import spaces +from torch import nn +from torch.optim.lr_scheduler import LambdaLR + +from habitat import Config, VectorEnv, logger +from habitat.utils import profiling_wrapper +from habitat.utils.visualizations.utils import observations_to_image +from habitat_baselines.common.base_trainer import BaseRLTrainer +from habitat_baselines.common.baseline_registry import baseline_registry +from habitat_baselines.common.environments import get_env_class +from habitat_baselines.common.obs_transformers import ( + apply_obs_transforms_batch, + apply_obs_transforms_obs_space, + get_active_obs_transforms, +) +from habitat_baselines.common.rollout_storage import RolloutStorage +from habitat_baselines.common.tensorboard_utils import TensorboardWriter +from habitat_baselines.rl.ddppo.algo import DDPPO +from habitat_baselines.rl.ddppo.algo.ddp_utils import ( + EXIT, + REQUEUE, + add_signal_handlers, + get_distrib_size, + init_distrib_slurm, + is_slurm_batch_job, + load_interrupted_state, + rank0_only, + requeue_job, + save_interrupted_state, +) +from habitat_baselines.rl.ppo import PPO +from habitat_baselines.rl.ppo.policy import Policy +from habitat_baselines.utils.common import batch_obs, generate_video +from habitat_baselines.utils.env_utils import construct_envs + + +@baseline_registry.register_trainer(name="ddppo") +@baseline_registry.register_trainer(name="ppo") +class PPOTrainer(BaseRLTrainer): + r"""Trainer class for PPO algorithm + Paper: https://arxiv.org/abs/1707.06347.
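+
+    Registered under both the "ppo" and "ddppo" trainer names; the DD-PPO
+    agent is substituted automatically when the detected world size is
+    greater than one.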
+ """ + supported_tasks = ["Nav-v0"] + + SHORT_ROLLOUT_THRESHOLD: float = 0.25 + _is_distributed: bool + envs: VectorEnv + agent: PPO + actor_critic: Policy + + def __init__(self, config=None): + interrupted_state = load_interrupted_state() + if interrupted_state is not None: + config = interrupted_state["config"] + + super().__init__(config) + self.actor_critic = None + self.agent = None + self.envs = None + self.obs_transforms = [] + + self._static_encoder = False + self._encoder = None + self._obs_space = None + + # Distirbuted if the world size would be + # greater than 1 + self._is_distributed = get_distrib_size()[2] > 1 + + @property + def obs_space(self): + if self._obs_space is None and self.envs is not None: + self._obs_space = self.envs.observation_spaces[0] + + return self._obs_space + + @obs_space.setter + def obs_space(self, new_obs_space): + self._obs_space = new_obs_space + + def _all_reduce(self, t: torch.Tensor) -> torch.Tensor: + r"""All reduce helper method that moves things to the correct + device and only runs if distributed + """ + if not self._is_distributed: + return t + + orig_device = t.device + t = t.to(device=self.device) + torch.distributed.all_reduce(t) + + return t.to(device=orig_device) + + def _setup_actor_critic_agent(self, ppo_cfg: Config) -> None: + r"""Sets up actor critic and agent for PPO. + + Args: + ppo_cfg: config node with relevant params + + Returns: + None + """ + logger.add_filehandler(self.config.LOG_FILE) + + policy = baseline_registry.get_policy(self.config.RL.POLICY.name) + observation_space = self.obs_space + self.obs_transforms = get_active_obs_transforms(self.config) + observation_space = apply_obs_transforms_obs_space( + observation_space, self.obs_transforms + ) + self.actor_critic = policy.from_config( + self.config, observation_space, self.envs.action_spaces[0] + ) + self.obs_space = observation_space + self.actor_critic.to(self.device) + + if ( + self.config.RL.DDPPO.pretrained_encoder + or self.config.RL.DDPPO.pretrained + ): + pretrained_state = torch.load( + self.config.RL.DDPPO.pretrained_weights, map_location="cpu" + ) + + if self.config.RL.DDPPO.pretrained: + self.actor_critic.load_state_dict( + { + k[len("actor_critic.") :]: v + for k, v in pretrained_state["state_dict"].items() + } + ) + elif self.config.RL.DDPPO.pretrained_encoder: + prefix = "actor_critic.net.visual_encoder." 
+ self.actor_critic.net.visual_encoder.load_state_dict( + { + k[len(prefix) :]: v + for k, v in pretrained_state["state_dict"].items() + if k.startswith(prefix) + } + ) + + if not self.config.RL.DDPPO.train_encoder: + self._static_encoder = True + for param in self.actor_critic.net.visual_encoder.parameters(): + param.requires_grad_(False) + + if self.config.RL.DDPPO.reset_critic: + nn.init.orthogonal_(self.actor_critic.critic.fc.weight) + nn.init.constant_(self.actor_critic.critic.fc.bias, 0) + + self.agent = (DDPPO if self._is_distributed else PPO)( + actor_critic=self.actor_critic, + clip_param=ppo_cfg.clip_param, + ppo_epoch=ppo_cfg.ppo_epoch, + num_mini_batch=ppo_cfg.num_mini_batch, + value_loss_coef=ppo_cfg.value_loss_coef, + entropy_coef=ppo_cfg.entropy_coef, + lr=ppo_cfg.lr, + eps=ppo_cfg.eps, + max_grad_norm=ppo_cfg.max_grad_norm, + use_normalized_advantage=ppo_cfg.use_normalized_advantage, + ) + + def _init_envs(self, config=None): + if config is None: + config = self.config + + self.envs = construct_envs( + config, + get_env_class(config.ENV_NAME), + workers_ignore_signals=is_slurm_batch_job(), + ) + + def _init_train(self): + if self.config.RL.DDPPO.force_distributed: + self._is_distributed = True + + if is_slurm_batch_job(): + add_signal_handlers() + + if self._is_distributed: + local_rank, tcp_store = init_distrib_slurm( + self.config.RL.DDPPO.distrib_backend + ) + if rank0_only(): + logger.info( + "Initialized DD-PPO with {} workers".format( + torch.distributed.get_world_size() + ) + ) + + self.config.defrost() + self.config.TORCH_GPU_ID = local_rank + self.config.SIMULATOR_GPU_ID = local_rank + # Multiply by the number of simulators to make sure they also get unique seeds + self.config.TASK_CONFIG.SEED += ( + torch.distributed.get_world_size() + * self.config.NUM_ENVIRONMENTS + ) + self.config.freeze() + + random.seed(self.config.TASK_CONFIG.SEED) + np.random.seed(self.config.TASK_CONFIG.SEED) + torch.manual_seed(self.config.TASK_CONFIG.SEED) + self.num_rollouts_done_store = torch.distributed.PrefixStore( + "rollout_tracker", tcp_store + ) + self.num_rollouts_done_store.set("num_done", "0") + + if rank0_only() and self.config.VERBOSE: + logger.info(f"config: {self.config}") + + profiling_wrapper.configure( + capture_start_step=self.config.PROFILING.CAPTURE_START_STEP, + num_steps_to_capture=self.config.PROFILING.NUM_STEPS_TO_CAPTURE, + ) + + self._init_envs() + + ppo_cfg = self.config.RL.PPO + if torch.cuda.is_available(): + self.device = torch.device("cuda", self.config.TORCH_GPU_ID) + torch.cuda.set_device(self.device) + else: + self.device = torch.device("cpu") + + if rank0_only() and not os.path.isdir(self.config.CHECKPOINT_FOLDER): + os.makedirs(self.config.CHECKPOINT_FOLDER) + + self._setup_actor_critic_agent(ppo_cfg) + if self._is_distributed: + self.agent.init_distributed(find_unused_params=True) + + logger.info( + "agent number of parameters: {}".format( + sum(param.numel() for param in self.agent.parameters()) + ) + ) + + obs_space = self.obs_space + if self._static_encoder: + self._encoder = self.actor_critic.net.visual_encoder + obs_space = spaces.Dict( + { + "visual_features": spaces.Box( + low=np.finfo(np.float32).min, + high=np.finfo(np.float32).max, + shape=self._encoder.output_shape, + dtype=np.float32, + ), + **obs_space.spaces, + } + ) + + self._nbuffers = 2 if ppo_cfg.use_double_buffered_sampler else 1 + self.rollouts = RolloutStorage( + ppo_cfg.num_steps, + self.envs.num_envs, + obs_space, + self.envs.action_spaces[0], + ppo_cfg.hidden_size, + 
num_recurrent_layers=self.actor_critic.net.num_recurrent_layers, + is_double_buffered=ppo_cfg.use_double_buffered_sampler, + ) + self.rollouts.to(self.device) + + observations = self.envs.reset() + batch = batch_obs(observations, device=self.device) + batch = apply_obs_transforms_batch(batch, self.obs_transforms) + + if self._static_encoder: + with torch.no_grad(): + batch["visual_features"] = self._encoder(batch) + + self.rollouts.buffers["observations"][0] = batch + + self.current_episode_reward = torch.zeros(self.envs.num_envs, 1) + self.running_episode_stats = dict( + count=torch.zeros(self.envs.num_envs, 1), + reward=torch.zeros(self.envs.num_envs, 1), + ) + self.window_episode_stats = defaultdict( + lambda: deque(maxlen=ppo_cfg.reward_window_size) + ) + + self.env_time = 0.0 + self.pth_time = 0.0 + self.t_start = time.time() + + @rank0_only + @profiling_wrapper.RangeContext("save_checkpoint") + def save_checkpoint( + self, file_name: str, extra_state: Optional[Dict] = None + ) -> None: + r"""Save checkpoint with specified name. + + Args: + file_name: file name for checkpoint + + Returns: + None + """ + checkpoint = { + "state_dict": self.agent.state_dict(), + "config": self.config, + } + if extra_state is not None: + checkpoint["extra_state"] = extra_state + + torch.save( + checkpoint, os.path.join(self.config.CHECKPOINT_FOLDER, file_name) + ) + + def load_checkpoint(self, checkpoint_path: str, *args, **kwargs) -> Dict: + r"""Load checkpoint of specified path as a dict. + + Args: + checkpoint_path: path of target checkpoint + *args: additional positional args + **kwargs: additional keyword args + + Returns: + dict containing checkpoint info + """ + return torch.load(checkpoint_path, *args, **kwargs) + + METRICS_BLACKLIST = {"top_down_map", "collisions.is_collision"} + + @classmethod + def _extract_scalars_from_info( + cls, info: Dict[str, Any] + ) -> Dict[str, float]: + result = {} + for k, v in info.items(): + if k in cls.METRICS_BLACKLIST: + continue + + if isinstance(v, dict): + result.update( + { + k + "." + subk: subv + for subk, subv in cls._extract_scalars_from_info( + v + ).items() + if (k + "." + subk) not in cls.METRICS_BLACKLIST + } + ) + # Things that are scalar-like will have an np.size of 1. + # Strings also have an np.size of 1, so explicitly ban those + elif np.size(v) == 1 and not isinstance(v, str): + result[k] = float(v) + + return result + + @classmethod + def _extract_scalars_from_infos( + cls, infos: List[Dict[str, Any]] + ) -> Dict[str, List[float]]: + + results = defaultdict(list) + for i in range(len(infos)): + for k, v in cls._extract_scalars_from_info(infos[i]).items(): + results[k].append(v) + + return results + + def _compute_actions_and_step_envs(self, buffer_index: int = 0): + num_envs = self.envs.num_envs + env_slice = slice( + int(buffer_index * num_envs / self._nbuffers), + int((buffer_index + 1) * num_envs / self._nbuffers), + ) + + t_sample_action = time.time() + + # sample actions + with torch.no_grad(): + step_batch = self.rollouts.buffers[ + self.rollouts.current_rollout_step_idxs[buffer_index], + env_slice, + ] + + profiling_wrapper.range_push("compute actions") + ( + values, + actions, + actions_log_probs, + recurrent_hidden_states, + ) = self.actor_critic.act( + step_batch["observations"], + step_batch["recurrent_hidden_states"], + step_batch["prev_actions"], + step_batch["masks"], + ) + + # NB: Move actions to CPU. If CUDA tensors are + # sent in to env.step(), that will create CUDA contexts + # in the subprocesses. 
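+        # (each subprocess that initializes CUDA allocates its own context,
+        # which costs GPU memory per worker and does not scale across many
+        # environment subprocesses)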
+ # For backwards compatibility, we also call .item() to convert to + # an int + actions = actions.to(device="cpu") + self.pth_time += time.time() - t_sample_action + + profiling_wrapper.range_pop() # compute actions + + t_step_env = time.time() + + for index_env, act in zip( + range(env_slice.start, env_slice.stop), actions.unbind(0) + ): + self.envs.async_step_at(index_env, act.item()) + + self.env_time += time.time() - t_step_env + + self.rollouts.insert( + next_recurrent_hidden_states=recurrent_hidden_states, + actions=actions, + action_log_probs=actions_log_probs, + value_preds=values, + buffer_index=buffer_index, + ) + + def _collect_environment_result(self, buffer_index: int = 0): + num_envs = self.envs.num_envs + env_slice = slice( + int(buffer_index * num_envs / self._nbuffers), + int((buffer_index + 1) * num_envs / self._nbuffers), + ) + + t_step_env = time.time() + outputs = [ + self.envs.wait_step_at(index_env) + for index_env in range(env_slice.start, env_slice.stop) + ] + + observations, rewards_l, dones, infos = [ + list(x) for x in zip(*outputs) + ] + + self.env_time += time.time() - t_step_env + + t_update_stats = time.time() + batch = batch_obs(observations, device=self.device) + batch = apply_obs_transforms_batch(batch, self.obs_transforms) + + rewards = torch.tensor( + rewards_l, + dtype=torch.float, + device=self.current_episode_reward.device, + ) + rewards = rewards.unsqueeze(1) + + not_done_masks = torch.tensor( + [[not done] for done in dones], + dtype=torch.bool, + device=self.current_episode_reward.device, + ) + done_masks = torch.logical_not(not_done_masks) + + self.current_episode_reward[env_slice] += rewards + current_ep_reward = self.current_episode_reward[env_slice] + self.running_episode_stats["reward"][env_slice] += current_ep_reward.where(done_masks, current_ep_reward.new_zeros(())) # type: ignore + self.running_episode_stats["count"][env_slice] += done_masks.float() # type: ignore + for k, v_k in self._extract_scalars_from_infos(infos).items(): + v = torch.tensor( + v_k, + dtype=torch.float, + device=self.current_episode_reward.device, + ).unsqueeze(1) + if k not in self.running_episode_stats: + self.running_episode_stats[k] = torch.zeros_like( + self.running_episode_stats["count"] + ) + + self.running_episode_stats[k][env_slice] += v.where(done_masks, v.new_zeros(())) # type: ignore + + self.current_episode_reward[env_slice].masked_fill_(done_masks, 0.0) + + if self._static_encoder: + with torch.no_grad(): + batch["visual_features"] = self._encoder(batch) + + self.rollouts.insert( + next_observations=batch, + rewards=rewards, + next_masks=not_done_masks, + buffer_index=buffer_index, + ) + + self.rollouts.advance_rollout(buffer_index) + + self.pth_time += time.time() - t_update_stats + + return env_slice.stop - env_slice.start + + @profiling_wrapper.RangeContext("_collect_rollout_step") + def _collect_rollout_step(self): + self._compute_actions_and_step_envs() + return self._collect_environment_result() + + @profiling_wrapper.RangeContext("_update_agent") + def _update_agent(self): + ppo_cfg = self.config.RL.PPO + t_update_model = time.time() + with torch.no_grad(): + step_batch = self.rollouts.buffers[ + self.rollouts.current_rollout_step_idx + ] + + next_value = self.actor_critic.get_value( + step_batch["observations"], + step_batch["recurrent_hidden_states"], + step_batch["prev_actions"], + step_batch["masks"], + ) + + self.rollouts.compute_returns( + next_value, ppo_cfg.use_gae, ppo_cfg.gamma, ppo_cfg.tau + ) + + self.agent.train() + + value_loss, 
action_loss, dist_entropy = self.agent.update( + self.rollouts + ) + + self.rollouts.after_update() + self.pth_time += time.time() - t_update_model + + return ( + value_loss, + action_loss, + dist_entropy, + ) + + def _coalesce_post_step( + self, losses: Dict[str, float], count_steps_delta: int + ) -> Dict[str, float]: + stats_ordering = sorted(self.running_episode_stats.keys()) + stats = torch.stack( + [self.running_episode_stats[k] for k in stats_ordering], 0 + ) + + stats = self._all_reduce(stats) + + for i, k in enumerate(stats_ordering): + self.window_episode_stats[k].append(stats[i]) + + if self._is_distributed: + loss_name_ordering = sorted(losses.keys()) + stats = torch.tensor( + [losses[k] for k in loss_name_ordering] + [count_steps_delta], + device="cpu", + dtype=torch.float32, + ) + stats = self._all_reduce(stats) + count_steps_delta = int(stats[-1].item()) + stats /= torch.distributed.get_world_size() + + losses = { + k: stats[i].item() for i, k in enumerate(loss_name_ordering) + } + + if self._is_distributed and rank0_only(): + self.num_rollouts_done_store.set("num_done", "0") + + self.num_steps_done += count_steps_delta + + return losses + + @rank0_only + def _training_log( + self, writer, losses: Dict[str, float], prev_time: int = 0 + ): + deltas = { + k: ( + (v[-1] - v[0]).sum().item() + if len(v) > 1 + else v[0].sum().item() + ) + for k, v in self.window_episode_stats.items() + } + deltas["count"] = max(deltas["count"], 1.0) + + writer.add_scalar( + "reward", + deltas["reward"] / deltas["count"], + self.num_steps_done, + ) + + # Check to see if there are any metrics + # that haven't been logged yet + metrics = { + k: v / deltas["count"] + for k, v in deltas.items() + if k not in {"reward", "count"} + } + if len(metrics) > 0: + writer.add_scalars("metrics", metrics, self.num_steps_done) + + writer.add_scalars( + "losses", + losses, + self.num_steps_done, + ) + + # log stats + if self.num_updates_done % self.config.LOG_INTERVAL == 0: + logger.info( + "update: {}\tfps: {:.3f}\t".format( + self.num_updates_done, + self.num_steps_done + / ((time.time() - self.t_start) + prev_time), + ) + ) + + logger.info( + "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t" + "frames: {}".format( + self.num_updates_done, + self.env_time, + self.pth_time, + self.num_steps_done, + ) + ) + + logger.info( + "Average window size: {} {}".format( + len(self.window_episode_stats["count"]), + " ".join( + "{}: {:.3f}".format(k, v / deltas["count"]) + for k, v in deltas.items() + if k != "count" + ), + ) + ) + + def should_end_early(self, rollout_step) -> bool: + if not self._is_distributed: + return False + # This is where the preemption of workers happens. If a + # worker detects it will be a straggler, it preempts itself! + return ( + rollout_step + >= self.config.RL.PPO.num_steps * self.SHORT_ROLLOUT_THRESHOLD + ) and int(self.num_rollouts_done_store.get("num_done")) >= ( + self.config.RL.DDPPO.sync_frac * torch.distributed.get_world_size() + ) + + @profiling_wrapper.RangeContext("train") + def train(self) -> None: + r"""Main method for training DD/PPO. 
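+
+        The loop alternates between collecting ``num_steps`` of experience
+        in the vectorized envs and running a PPO update, checkpointing and
+        handling SLURM preemption/requeue along the way.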
+
+        Returns:
+            None
+        """
+
+        self._init_train()
+
+        count_checkpoints = 0
+        prev_time = 0
+
+        lr_scheduler = LambdaLR(
+            optimizer=self.agent.optimizer,
+            lr_lambda=lambda x: 1 - self.percent_done(),
+        )
+
+        interrupted_state = load_interrupted_state()
+        if interrupted_state is not None:
+            self.agent.load_state_dict(interrupted_state["state_dict"])
+            self.agent.optimizer.load_state_dict(
+                interrupted_state["optim_state"]
+            )
+            lr_scheduler.load_state_dict(interrupted_state["lr_sched_state"])
+
+            requeue_stats = interrupted_state["requeue_stats"]
+            self.env_time = requeue_stats["env_time"]
+            self.pth_time = requeue_stats["pth_time"]
+            self.num_steps_done = requeue_stats["num_steps_done"]
+            self.num_updates_done = requeue_stats["num_updates_done"]
+            self._last_checkpoint_percent = requeue_stats[
+                "_last_checkpoint_percent"
+            ]
+            count_checkpoints = requeue_stats["count_checkpoints"]
+            prev_time = requeue_stats["prev_time"]
+
+        ppo_cfg = self.config.RL.PPO
+
+        with (
+            TensorboardWriter(
+                self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs
+            )
+            if rank0_only()
+            else contextlib.suppress()
+        ) as writer:
+            while not self.is_done():
+                profiling_wrapper.on_start_step()
+                profiling_wrapper.range_push("train update")
+
+                if ppo_cfg.use_linear_clip_decay:
+                    self.agent.clip_param = ppo_cfg.clip_param * (
+                        1 - self.percent_done()
+                    )
+
+                if EXIT.is_set():
+                    profiling_wrapper.range_pop()  # train update
+
+                    self.envs.close()
+
+                    if REQUEUE.is_set() and rank0_only():
+                        requeue_stats = dict(
+                            env_time=self.env_time,
+                            pth_time=self.pth_time,
+                            count_checkpoints=count_checkpoints,
+                            num_steps_done=self.num_steps_done,
+                            num_updates_done=self.num_updates_done,
+                            _last_checkpoint_percent=self._last_checkpoint_percent,
+                            prev_time=(time.time() - self.t_start) + prev_time,
+                        )
+                        save_interrupted_state(
+                            dict(
+                                state_dict=self.agent.state_dict(),
+                                optim_state=self.agent.optimizer.state_dict(),
+                                lr_sched_state=lr_scheduler.state_dict(),
+                                config=self.config,
+                                requeue_stats=requeue_stats,
+                            )
+                        )
+
+                    requeue_job()
+                    return
+
+                self.agent.eval()
+                count_steps_delta = 0
+                profiling_wrapper.range_push("rollouts loop")
+
+                profiling_wrapper.range_push("_collect_rollout_step")
+                for buffer_index in range(self._nbuffers):
+                    self._compute_actions_and_step_envs(buffer_index)
+
+                for step in range(ppo_cfg.num_steps):
+                    is_last_step = (
+                        self.should_end_early(step + 1)
+                        or (step + 1) == ppo_cfg.num_steps
+                    )
+
+                    for buffer_index in range(self._nbuffers):
+                        count_steps_delta += self._collect_environment_result(
+                            buffer_index
+                        )
+
+                        if (buffer_index + 1) == self._nbuffers:
+                            profiling_wrapper.range_pop()  # _collect_rollout_step
+
+                        if not is_last_step:
+                            if (buffer_index + 1) == self._nbuffers:
+                                profiling_wrapper.range_push(
+                                    "_collect_rollout_step"
+                                )
+
+                            self._compute_actions_and_step_envs(buffer_index)
+
+                    if is_last_step:
+                        break
+
+                profiling_wrapper.range_pop()  # rollouts loop
+
+                if self._is_distributed:
+                    self.num_rollouts_done_store.add("num_done", 1)
+
+                (
+                    value_loss,
+                    action_loss,
+                    dist_entropy,
+                ) = self._update_agent()
+
+                if ppo_cfg.use_linear_lr_decay:
+                    lr_scheduler.step()  # type: ignore
+
+                self.num_updates_done += 1
+                losses = self._coalesce_post_step(
+                    dict(value_loss=value_loss, action_loss=action_loss),
+                    count_steps_delta,
+                )
+
+                self._training_log(writer, losses, prev_time)
+
+                # checkpoint model
+                if rank0_only() and self.should_checkpoint():
+                    self.save_checkpoint(
f"ckpt.{count_checkpoints}.pth", + dict( + step=self.num_steps_done, + wall_time=(time.time() - self.t_start) + prev_time, + ), + ) + count_checkpoints += 1 + + profiling_wrapper.range_pop() # train update + + self.envs.close() + + def _eval_checkpoint( + self, + checkpoint_path: str, + writer: TensorboardWriter, + checkpoint_index: int = 0, + ) -> None: + r"""Evaluates a single checkpoint. + + Args: + checkpoint_path: path of checkpoint + writer: tensorboard writer object for logging to tensorboard + checkpoint_index: index of cur checkpoint for logging + + Returns: + None + """ + if self._is_distributed: + raise RuntimeError("Evaluation does not support distributed mode") + + # Map location CPU is almost always better than mapping to a CUDA device. + ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu") + + if self.config.EVAL.USE_CKPT_CONFIG: + config = self._setup_eval_config(ckpt_dict["config"]) + else: + config = self.config.clone() + + ppo_cfg = config.RL.PPO + + config.defrost() + config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT + config.freeze() + + if len(self.config.VIDEO_OPTION) > 0: + config.defrost() + config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") + config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS") + config.freeze() + + if config.VERBOSE: + logger.info(f"env config: {config}") + + self._init_envs(config) + self._setup_actor_critic_agent(ppo_cfg) + + self.agent.load_state_dict(ckpt_dict["state_dict"]) + self.actor_critic = self.agent.actor_critic + + observations = self.envs.reset() + batch = batch_obs(observations, device=self.device) + batch = apply_obs_transforms_batch(batch, self.obs_transforms) + + current_episode_reward = torch.zeros( + self.envs.num_envs, 1, device="cpu" + ) + + test_recurrent_hidden_states = torch.zeros( + self.config.NUM_ENVIRONMENTS, + self.actor_critic.net.num_recurrent_layers, + ppo_cfg.hidden_size, + device=self.device, + ) + prev_actions = torch.zeros( + self.config.NUM_ENVIRONMENTS, + 1, + device=self.device, + dtype=torch.long, + ) + not_done_masks = torch.zeros( + self.config.NUM_ENVIRONMENTS, + 1, + device=self.device, + dtype=torch.bool, + ) + stats_episodes: Dict[ + Any, Any + ] = {} # dict of dicts that stores stats per episode + + rgb_frames = [ + [] for _ in range(self.config.NUM_ENVIRONMENTS) + ] # type: List[List[np.ndarray]] + if len(self.config.VIDEO_OPTION) > 0: + os.makedirs(self.config.VIDEO_DIR, exist_ok=True) + + number_of_eval_episodes = self.config.TEST_EPISODE_COUNT + if number_of_eval_episodes == -1: + number_of_eval_episodes = sum(self.envs.number_of_episodes) + else: + total_num_eps = sum(self.envs.number_of_episodes) + if total_num_eps < number_of_eval_episodes: + logger.warn( + f"Config specified {number_of_eval_episodes} eval episodes" + ", dataset only has {total_num_eps}." + ) + logger.warn(f"Evaluating with {total_num_eps} instead.") + number_of_eval_episodes = total_num_eps + + pbar = tqdm.tqdm(total=number_of_eval_episodes) + self.actor_critic.eval() + while ( + len(stats_episodes) < number_of_eval_episodes + and self.envs.num_envs > 0 + ): + current_episodes = self.envs.current_episodes() + + with torch.no_grad(): + ( + _, + actions, + _, + test_recurrent_hidden_states, + ) = self.actor_critic.act( + batch, + test_recurrent_hidden_states, + prev_actions, + not_done_masks, + deterministic=False, + ) + + prev_actions.copy_(actions) # type: ignore + + # NB: Move actions to CPU. 
If CUDA tensors are + # sent in to env.step(), that will create CUDA contexts + # in the subprocesses. + # For backwards compatibility, we also call .item() to convert to + # an int + step_data = [a.item() for a in actions.to(device="cpu")] + + outputs = self.envs.step(step_data) + + observations, rewards_l, dones, infos = [ + list(x) for x in zip(*outputs) + ] + batch = batch_obs(observations, device=self.device) + batch = apply_obs_transforms_batch(batch, self.obs_transforms) + + not_done_masks = torch.tensor( + [[not done] for done in dones], + dtype=torch.bool, + device="cpu", + ) + + rewards = torch.tensor( + rewards_l, dtype=torch.float, device="cpu" + ).unsqueeze(1) + current_episode_reward += rewards + next_episodes = self.envs.current_episodes() + envs_to_pause = [] + n_envs = self.envs.num_envs + for i in range(n_envs): + if ( + next_episodes[i].scene_id, + next_episodes[i].episode_id, + ) in stats_episodes: + envs_to_pause.append(i) + + # episode ended + if not not_done_masks[i].item(): + pbar.update() + episode_stats = {} + episode_stats["reward"] = current_episode_reward[i].item() + episode_stats.update( + self._extract_scalars_from_info(infos[i]) + ) + current_episode_reward[i] = 0 + # use scene_id + episode_id as unique id for storing stats + stats_episodes[ + ( + current_episodes[i].scene_id, + current_episodes[i].episode_id, + ) + ] = episode_stats + + if len(self.config.VIDEO_OPTION) > 0: + generate_video( + video_option=self.config.VIDEO_OPTION, + video_dir=self.config.VIDEO_DIR, + images=rgb_frames[i], + episode_id=current_episodes[i].episode_id, + checkpoint_idx=checkpoint_index, + metrics=self._extract_scalars_from_info(infos[i]), + tb_writer=writer, + ) + + rgb_frames[i] = [] + + # episode continues + elif len(self.config.VIDEO_OPTION) > 0: + # TODO move normalization / channel changing out of the policy and undo it here + frame = observations_to_image( + {k: v[i] for k, v in batch.items()}, infos[i] + ) + rgb_frames[i].append(frame) + + not_done_masks = not_done_masks.to(device=self.device) + ( + self.envs, + test_recurrent_hidden_states, + not_done_masks, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + ) = self._pause_envs( + envs_to_pause, + self.envs, + test_recurrent_hidden_states, + not_done_masks, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + ) + + num_episodes = len(stats_episodes) + aggregated_stats = {} + for stat_key in next(iter(stats_episodes.values())).keys(): + aggregated_stats[stat_key] = ( + sum(v[stat_key] for v in stats_episodes.values()) + / num_episodes + ) + + for k, v in aggregated_stats.items(): + logger.info(f"Average episode {k}: {v:.4f}") + + step_id = checkpoint_index + if "extra_state" in ckpt_dict and "step" in ckpt_dict["extra_state"]: + step_id = ckpt_dict["extra_state"]["step"] + + writer.add_scalars( + "eval_reward", + {"average reward": aggregated_stats["reward"]}, + step_id, + ) + + metrics = {k: v for k, v in aggregated_stats.items() if k != "reward"} + if len(metrics) > 0: + writer.add_scalars("eval_metrics", metrics, step_id) + + self.envs.close() diff --git a/habitat-lab-dialog/habitat_baselines/rl/requirements.txt b/habitat-lab-dialog/habitat_baselines/rl/requirements.txt new file mode 100644 index 0000000..c79d56b --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/rl/requirements.txt @@ -0,0 +1,5 @@ +moviepy>=1.0.1 +torch>=1.3.1 +# full tensorflow required for tensorboard video support +tensorflow==1.13.1 +tb-nightly diff --git a/habitat-lab-dialog/habitat_baselines/run.py 
b/habitat-lab-dialog/habitat_baselines/run.py
new file mode 100644
index 0000000..a5c71b4
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/run.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import random
+
+import numpy as np
+import torch
+
+from habitat.config import Config
+from habitat_baselines.common.baseline_registry import baseline_registry
+from habitat_baselines.config.default import get_config
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--run-type",
+        choices=["train", "eval"],
+        required=True,
+        help="run type of the experiment (train or eval)",
+    )
+    parser.add_argument(
+        "--exp-config",
+        type=str,
+        required=True,
+        help="path to config yaml containing info about experiment",
+    )
+    parser.add_argument(
+        "opts",
+        default=None,
+        nargs=argparse.REMAINDER,
+        help="Modify config options from command line",
+    )
+
+    args = parser.parse_args()
+    run_exp(**vars(args))
+
+
+def execute_exp(config: Config, run_type: str) -> None:
+    r"""This function runs the specified config with the specified run type.
+    Args:
+        config: habitat Config object
+        run_type: str {train or eval}
+    """
+    random.seed(config.TASK_CONFIG.SEED)
+    np.random.seed(config.TASK_CONFIG.SEED)
+    torch.manual_seed(config.TASK_CONFIG.SEED)
+
+    trainer_init = baseline_registry.get_trainer(config.TRAINER_NAME)
+    assert trainer_init is not None, f"{config.TRAINER_NAME} is not supported"
+    trainer = trainer_init(config)
+
+    if run_type == "train":
+        trainer.train()
+    elif run_type == "eval":
+        trainer.eval()
+
+
+def run_exp(exp_config: str, run_type: str, opts=None) -> None:
+    r"""Runs experiment given mode and config
+
+    Args:
+        exp_config: path to config file.
+        run_type: "train" or "eval".
+        opts: list of strings of additional config options.
+
+    Returns:
+        None.
+    """
+    config = get_config(exp_config, opts)
+    execute_exp(config, run_type)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/habitat-lab-dialog/habitat_baselines/slambased/README.md b/habitat-lab-dialog/habitat_baselines/slambased/README.md
new file mode 100644
index 0000000..5c9f7c8
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/slambased/README.md
@@ -0,0 +1,41 @@
+### Handcrafted agent baseline adapted from the paper "Benchmarking Classic and Learned Navigation in Complex 3D Environments"
+
+Project website: https://sites.google.com/view/classic-vs-learned-navigation
+Paper: https://arxiv.org/abs/1901.10915
+

+ +

+
+If you use this code or the provided environments in your research, please cite the following:
+
+    @ARTICLE{Navigation2019,
+           author = {{Mishkin}, Dmytro and {Dosovitskiy}, Alexey and {Koltun}, Vladlen},
+            title = "{Benchmarking Classic and Learned Navigation in Complex 3D Environments}",
+             year = 2019,
+            month = Jan,
+    archivePrefix = {arXiv},
+           eprint = {1901.10915},
+    }
+
+
+
+## Dependencies:
+
+- conda
+- numpy
+- pytorch
+- ORBSLAM2
+
+
+## Tested with:
+- Ubuntu 16.04
+- python 3.6
+- pytorch 0.4, 1.0
+
+
+- Install Anaconda https://www.anaconda.com/download/#linux
+
+- Install dependencies via ./install_deps.sh. It should install everything except the datasets.
+
+A simple example of working with agents is shown in [this notebook](../handcrafted-agent-example.ipynb).
diff --git a/habitat-lab-dialog/habitat_baselines/slambased/install_deps.sh b/habitat-lab-dialog/habitat_baselines/slambased/install_deps.sh
new file mode 100755
index 0000000..659fe7d
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/slambased/install_deps.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+DIR1=$(pwd)
+MAINDIR=$(pwd)/3rdparty
+mkdir "${MAINDIR}"
+cd "${MAINDIR}" || exit
+#conda create -y -n "HandcraftedAgents" python=3.6
+source activate HandcraftedAgents
+conda install opencv -y
+conda install pytorch torchvision -c pytorch -y
+conda install -c conda-forge imageio -y
+conda install ffmpeg -c conda-forge -y
+cd "${MAINDIR}" || exit
+mkdir eigen3
+cd eigen3 || exit
+wget https://gitlab.com/libeigen/eigen/-/archive/3.3.5/eigen-3.3.5.tar.gz
+tar -xzf eigen-3.3.5.tar.gz
+cd eigen-3.3.5 || exit
+mkdir build
+cd build || exit
+cmake .. -DCMAKE_INSTALL_PREFIX="${MAINDIR}"/eigen3_installed/
+make install
+cd "${MAINDIR}" || exit
+wget https://sourceforge.net/projects/glew/files/glew/2.1.0/glew-2.1.0.zip
+unzip glew-2.1.0.zip
+cd glew-2.1.0/ || exit
+cd build || exit
+cmake ./cmake -DCMAKE_INSTALL_PREFIX="${MAINDIR}"/glew_installed
+make -j4
+make install
+cd "${MAINDIR}" || exit
+#pip install numpy --upgrade
+rm Pangolin -rf
+git clone https://github.com/stevenlovegrove/Pangolin.git
+cd Pangolin || exit
+mkdir build
+cd build || exit
+cmake .. -DCMAKE_PREFIX_PATH="${MAINDIR}"/glew_installed/ -DCMAKE_LIBRARY_PATH="${MAINDIR}"/glew_installed/lib/ -DCMAKE_INSTALL_PREFIX="${MAINDIR}"/pangolin_installed
+cmake --build .
+cd "${MAINDIR}" || exit
+rm ORB_SLAM2 -rf
+rm ORB_SLAM2-PythonBindings -rf
+git clone https://github.com/ducha-aiki/ORB_SLAM2
+git clone https://github.com/ducha-aiki/ORB_SLAM2-PythonBindings
+cd "${MAINDIR}"/ORB_SLAM2 || exit
+sed -i "s,cmake .. -DCMAKE_BUILD_TYPE=Release,cmake .. -DCMAKE_BUILD_TYPE=Release -DEIGEN3_INCLUDE_DIR=${MAINDIR}/eigen3_installed/include/eigen3/ -DCMAKE_INSTALL_PREFIX=${MAINDIR}/ORBSLAM2_installed ,g" build.sh
+ln -s "${MAINDIR}"/eigen3_installed/include/eigen3/Eigen "${MAINDIR}"/ORB_SLAM2/Thirdparty/g2o/g2o/core/Eigen
+./build.sh
+cd build || exit
+make install
+cd "${MAINDIR}" || exit
+cd ORB_SLAM2-PythonBindings/src || exit
+ln -s "${MAINDIR}"/eigen3_installed/include/eigen3/Eigen Eigen
+cd "${MAINDIR}"/ORB_SLAM2-PythonBindings || exit
+mkdir build
+cd build || exit
+CONDA_DIR="$(dirname $(dirname $(which conda)))"
+CONDA_DIR=\"${CONDA_DIR}/envs/HandcraftedAgents/lib/python3.6/site-packages/\"
+sed -i "s,lib/python3.5/dist-packages,${CONDA_DIR},g" ../CMakeLists.txt
+cmake ..
-DPYTHON_INCLUDE_DIR=$(python -c "from distutils.sysconfig import get_python_inc; print(get_python_inc())") -DPYTHON_LIBRARY=$(python -c "import distutils.sysconfig as sysconfig; print(sysconfig.get_config_var('LIBDIR'))")/libpython3.6m.so -DPYTHON_EXECUTABLE:FILEPATH=$(which python) -DCMAKE_LIBRARY_PATH="${MAINDIR}"/ORBSLAM2_installed/lib -DCMAKE_INCLUDE_PATH="${MAINDIR}"/ORBSLAM2_installed/include;"${MAINDIR}"/eigen3_installed/include/eigen3 -DCMAKE_INSTALL_PREFIX="${MAINDIR}"/pyorbslam2_installed +make +make install +cp "${MAINDIR}"/ORB_SLAM2/Vocabulary/ORBvoc.txt "${DIR1}"/data/ diff --git a/habitat-lab-dialog/habitat_baselines/slambased/mappers.py b/habitat-lab-dialog/habitat_baselines/slambased/mappers.py new file mode 100644 index 0000000..0df1758 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/slambased/mappers.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import torch +from torch import nn as nn + +from habitat_baselines.slambased.reprojection import ( + get_map_size_in_cells, + project2d_pcl_into_worldmap, + reproject_local_to_global, +) + + +def depth2local3d(depth, fx, fy, cx, cy): + r"""Projects depth map to 3d point cloud + with origin in the camera focus + """ + device = depth.device + h, w = depth.squeeze().size() + x = torch.linspace(0, w - 1, w).to(device) + y = torch.linspace(0, h - 1, h).to(device) + xv, yv = torch.meshgrid([x, y]) + dfl = depth.t().flatten() + return torch.cat( + [ + (dfl * (xv.flatten() - cx) / fx).unsqueeze(-1), # x + (dfl * (yv.flatten() - cy) / fy).unsqueeze(-1), # y + dfl.unsqueeze(-1), + ], + dim=1, + ) # z + + +def pcl_to_obstacles(pts3d, map_size=40, cell_size=0.2, min_pts=10): + r"""Counts number of 3d points in 2d map cell. + Height is sum-pooled. 
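+    Cells are filled only when more than ``min_pts`` points survive the
+    projection; otherwise an all-zero map is returned.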
+ """ + device = pts3d.device + map_size_in_cells = get_map_size_in_cells(map_size, cell_size) - 1 + init_map = torch.zeros( + (map_size_in_cells, map_size_in_cells), device=device + ) + if len(pts3d) <= 1: + return init_map + num_pts, dim = pts3d.size() + pts2d = torch.cat([pts3d[:, 2:3], pts3d[:, 0:1]], dim=1) + data_idxs = torch.round( + project2d_pcl_into_worldmap(pts2d, map_size, cell_size) + ) + if len(data_idxs) > min_pts: + u, counts = np.unique( + data_idxs.detach().cpu().numpy(), axis=0, return_counts=True + ) + init_map[u[:, 0], u[:, 1]] = torch.from_numpy(counts).to( + dtype=torch.float32, device=device + ) + return init_map + + +class DirectDepthMapper(nn.Module): + r"""Estimates obstacle map given the depth image + ToDo: replace numpy histogram counting with differentiable + pytorch soft count like in + https://papers.nips.cc/paper/7545-unsupervised-learning-of-shape-and-pose-with-differentiable-point-clouds.pdf + """ + + def __init__( + self, + camera_height=0, + near_th=0.1, + far_th=4.0, + h_min=0.0, + h_max=1.0, + map_size=40, + map_cell_size=0.1, + device=torch.device("cpu"), # noqa: B008 + **kwargs + ): + super(DirectDepthMapper, self).__init__() + self.device = device + self.near_th = near_th + self.far_th = far_th + self.h_min_th = h_min + self.h_max_th = h_max + self.camera_height = camera_height + self.map_size_meters = map_size + self.map_cell_size = map_cell_size + return + + def forward(self, depth, pose=torch.eye(4).float()): # noqa: B008 + self.device = depth.device + # Works for FOV = 90 degrees + # Should be adjusted, if FOV changed + self.fx = float(depth.size(1)) / 2.0 + self.fy = float(depth.size(0)) / 2.0 + self.cx = int(self.fx) - 1 + self.cy = int(self.fy) - 1 + pose = pose.to(self.device) + local_3d_pcl = depth2local3d(depth, self.fx, self.fy, self.cx, self.cy) + idxs = (torch.abs(local_3d_pcl[:, 2]) < self.far_th) * ( + torch.abs(local_3d_pcl[:, 2]) >= self.near_th + ) + survived_points = local_3d_pcl[idxs] + if len(survived_points) < 20: + map_size_in_cells = ( + get_map_size_in_cells(self.map_size_meters, self.map_cell_size) + - 1 + ) + init_map = torch.zeros( + (map_size_in_cells, map_size_in_cells), device=self.device + ) + return init_map + global_3d_pcl = reproject_local_to_global(survived_points, pose)[:, :3] + # Because originally y looks down and from agent camera height + global_3d_pcl[:, 1] = -global_3d_pcl[:, 1] + self.camera_height + idxs = (global_3d_pcl[:, 1] > self.h_min_th) * ( + global_3d_pcl[:, 1] < self.h_max_th + ) + global_3d_pcl = global_3d_pcl[idxs] + obstacle_map = pcl_to_obstacles( + global_3d_pcl, self.map_size_meters, self.map_cell_size + ) + return obstacle_map diff --git a/habitat-lab-dialog/habitat_baselines/slambased/monodepth.py b/habitat-lab-dialog/habitat_baselines/slambased/monodepth.py new file mode 100644 index 0000000..50906f4 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/slambased/monodepth.py @@ -0,0 +1,676 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +r"""The code below is taken from +https://github.com/JunjH/Revisiting_Single_Depth_Estimation +Revisiting Single Image Depth Estimation: Toward Higher Resolution Maps With Accurate Object Boundaries +Junjie Hu and Mete Ozay and Yan Zhang and Takayuki Okatani +WACV 2019 + +ResNet code gently borrowed from +https://github.com/pytorch/vision/blob/master/torchvision/models/py +""" + + +import math + +import numpy as np +import torch +import torch.nn.parallel +from PIL import Image +from torch import nn as nn +from torch.nn import functional as F +from torch.utils import model_zoo as model_zoo +from torchvision import transforms + +accimage = None + + +__all__ = [ + "ResNet", + "resnet18", + "resnet34", + "resnet50", + "resnet101", + "resnet152", +] + + +model_urls = { + "resnet18": "https://download.pytorch.org/models/resnet18-5c106cde.pth", + "resnet34": "https://download.pytorch.org/models/resnet34-333f7ec4.pth", + "resnet50": "https://download.pytorch.org/models/resnet50-19c8e357.pth", + "resnet101": "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth", + "resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth", +} + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + ) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride) + self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1, downsample=None): + super(Bottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, stride=stride, padding=1, bias=False + ) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class ResNet(nn.Module): + def __init__(self, block, layers, num_classes=1000): + self.inplanes = 64 + super(ResNet, self).__init__() + self.conv1 = nn.Conv2d( + 3, 64, kernel_size=7, stride=2, padding=3, bias=False + ) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + 
self.layer4 = self._make_layer(block, 512, layers[3], stride=2) + self.avgpool = nn.AvgPool2d(7, stride=1) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2.0 / n)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_layer(self, block, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False, + ), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append(block(self.inplanes, planes, stride, downsample)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + x = self.avgpool(x) + x = x.view(x.size(0), -1) + x = self.fc(x) + + return x + + +def resnet18(pretrained=False, **kwargs): + r"""Constructs a ResNet-18 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls["resnet18"])) + return model + + +def resnet34(pretrained=False, **kwargs): + r"""Constructs a ResNet-34 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls["resnet34"])) + return model + + +def resnet50(pretrained=False, **kwargs): + r"""Constructs a ResNet-50 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) + if pretrained: + model.load_state_dict( + model_zoo.load_url( + model_urls["resnet50"], "pretrained_model/encoder" + ) + ) + return model + + +def resnet101(pretrained=False, **kwargs): + r"""Constructs a ResNet-101 model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls["resnet101"])) + return model + + +def resnet152(pretrained=False, **kwargs): + r"""Constructs a ResNet-152 model. 
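+    Weights are fetched from ``model_urls`` via ``model_zoo.load_url`` when
+    ``pretrained=True``.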
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls["resnet152"])) + return model + + +class model(nn.Module): + def __init__(self, Encoder, num_features, block_channel): + + super(model, self).__init__() + + self.E = Encoder + self.D = D(num_features) + self.MFF = MFF(block_channel) + self.R = R(block_channel) + + def forward(self, x): + x_block1, x_block2, x_block3, x_block4 = self.E(x) + x_decoder = self.D(x_block1, x_block2, x_block3, x_block4) + x_mff = self.MFF( + x_block1, + x_block2, + x_block3, + x_block4, + [x_decoder.size(2), x_decoder.size(3)], + ) + out = self.R(torch.cat((x_decoder, x_mff), 1)) + + return out + + +class _UpProjection(nn.Sequential): + def __init__(self, num_input_features, num_output_features): + super(_UpProjection, self).__init__() + + self.conv1 = nn.Conv2d( + num_input_features, + num_output_features, + kernel_size=5, + stride=1, + padding=2, + bias=False, + ) + self.bn1 = nn.BatchNorm2d(num_output_features) + self.relu = nn.ReLU(inplace=True) + self.conv1_2 = nn.Conv2d( + num_output_features, + num_output_features, + kernel_size=3, + stride=1, + padding=1, + bias=False, + ) + self.bn1_2 = nn.BatchNorm2d(num_output_features) + + self.conv2 = nn.Conv2d( + num_input_features, + num_output_features, + kernel_size=5, + stride=1, + padding=2, + bias=False, + ) + self.bn2 = nn.BatchNorm2d(num_output_features) + + def forward(self, x, size): + x = F.upsample(x, size=size, mode="bilinear") + x_conv1 = self.relu(self.bn1(self.conv1(x))) + bran1 = self.bn1_2(self.conv1_2(x_conv1)) + bran2 = self.bn2(self.conv2(x)) + + out = self.relu(bran1 + bran2) + + return out + + +class E_resnet(nn.Module): + def __init__(self, original_model, num_features=2048): + super(E_resnet, self).__init__() + self.conv1 = original_model.conv1 + self.bn1 = original_model.bn1 + self.relu = original_model.relu + self.maxpool = original_model.maxpool + + self.layer1 = original_model.layer1 + self.layer2 = original_model.layer2 + self.layer3 = original_model.layer3 + self.layer4 = original_model.layer4 + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + + x_block1 = self.layer1(x) + x_block2 = self.layer2(x_block1) + x_block3 = self.layer3(x_block2) + x_block4 = self.layer4(x_block3) + + return x_block1, x_block2, x_block3, x_block4 + + +class D(nn.Module): + def __init__(self, num_features=2048): + super(D, self).__init__() + self.conv = nn.Conv2d( + num_features, + num_features // 2, + kernel_size=1, + stride=1, + bias=False, + ) + num_features = num_features // 2 + self.bn = nn.BatchNorm2d(num_features) + + self.up1 = _UpProjection( + num_input_features=num_features, + num_output_features=num_features // 2, + ) + num_features = num_features // 2 + + self.up2 = _UpProjection( + num_input_features=num_features, + num_output_features=num_features // 2, + ) + num_features = num_features // 2 + + self.up3 = _UpProjection( + num_input_features=num_features, + num_output_features=num_features // 2, + ) + num_features = num_features // 2 + + self.up4 = _UpProjection( + num_input_features=num_features, + num_output_features=num_features // 2, + ) + num_features = num_features // 2 + + def forward(self, x_block1, x_block2, x_block3, x_block4): + x_d0 = F.relu(self.bn(self.conv(x_block4))) + x_d1 = self.up1(x_d0, [x_block3.size(2), x_block3.size(3)]) + x_d2 = self.up2(x_d1, [x_block2.size(2), 
x_block2.size(3)]) + x_d3 = self.up3(x_d2, [x_block1.size(2), x_block1.size(3)]) + x_d4 = self.up4(x_d3, [x_block1.size(2) * 2, x_block1.size(3) * 2]) + + return x_d4 + + +class MFF(nn.Module): + def __init__(self, block_channel, num_features=64): + + super(MFF, self).__init__() + + self.up1 = _UpProjection( + num_input_features=block_channel[0], num_output_features=16 + ) + + self.up2 = _UpProjection( + num_input_features=block_channel[1], num_output_features=16 + ) + + self.up3 = _UpProjection( + num_input_features=block_channel[2], num_output_features=16 + ) + + self.up4 = _UpProjection( + num_input_features=block_channel[3], num_output_features=16 + ) + + self.conv = nn.Conv2d( + num_features, + num_features, + kernel_size=5, + stride=1, + padding=2, + bias=False, + ) + self.bn = nn.BatchNorm2d(num_features) + + def forward(self, x_block1, x_block2, x_block3, x_block4, size): + x_m1 = self.up1(x_block1, size) + x_m2 = self.up2(x_block2, size) + x_m3 = self.up3(x_block3, size) + x_m4 = self.up4(x_block4, size) + + x = self.bn(self.conv(torch.cat((x_m1, x_m2, x_m3, x_m4), 1))) + x = F.relu(x) + + return x + + +class R(nn.Module): + def __init__(self, block_channel): + + super(R, self).__init__() + + num_features = 64 + block_channel[3] // 32 + self.conv0 = nn.Conv2d( + num_features, + num_features, + kernel_size=5, + stride=1, + padding=2, + bias=False, + ) + self.bn0 = nn.BatchNorm2d(num_features) + + self.conv1 = nn.Conv2d( + num_features, + num_features, + kernel_size=5, + stride=1, + padding=2, + bias=False, + ) + self.bn1 = nn.BatchNorm2d(num_features) + + self.conv2 = nn.Conv2d( + num_features, 1, kernel_size=5, stride=1, padding=2, bias=True + ) + + def forward(self, x): + x0 = self.conv0(x) + x0 = self.bn0(x0) + x0 = F.relu(x0) + + x1 = self.conv1(x0) + x1 = self.bn1(x1) + x1 = F.relu(x1) + + x2 = self.conv2(x1) + + return x2 + + +def _is_pil_image(img): + return isinstance(img, Image.Image) + + +def _is_numpy_image(img): + return isinstance(img, np.ndarray) and (img.ndim in {2, 3}) + + +class Scale(object): + def __init__(self, size): + self.size = size + + def __call__(self, image): + image = self.changeScale(image, self.size) + + return image + + def changeScale(self, img, size, interpolation=Image.BILINEAR): + ow, oh = size + + return img.resize((ow, oh), interpolation) + + +class CenterCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, image): + image = self.centerCrop(image, self.size) + + return image + + def centerCrop(self, image, size): + w1, h1 = image.size + tw, th = size + + if w1 == tw and h1 == th: + return image + + x1 = int(round((w1 - tw) / 2.0)) + y1 = int(round((h1 - th) / 2.0)) + + image = image.crop((x1, y1, tw + x1, th + y1)) + + return image + + +class ToTensor(object): + r"""Convert a ``PIL.Image`` or ``numpy.ndarray`` to tensor. + Converts a PIL.Image or numpy.ndarray (H x W x C) in the range + [0, 255] to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0]. + """ + + def __call__(self, image): + image = self.to_tensor(image) + + return image + + def to_tensor(self, pic): + if not (_is_pil_image(pic) or _is_numpy_image(pic)): + raise TypeError( + "pic should be PIL Image or ndarray. 
Got {}".format(type(pic)) + ) + + if isinstance(pic, np.ndarray): + + img = torch.from_numpy(pic.transpose((2, 0, 1))) + return img.float().div(255) + + if accimage is not None and isinstance(pic, accimage.Image): + nppic = np.zeros( + [pic.channels, pic.height, pic.width], dtype=np.float32 + ) + pic.copyto(nppic) + return torch.from_numpy(nppic) + + # handle PIL Image + if pic.mode == "I": + img = torch.from_numpy(np.array(pic, np.int32, copy=False)) + elif pic.mode == "I;16": + img = torch.from_numpy(np.array(pic, np.int16, copy=False)) + else: + img = torch.ByteTensor( + torch.ByteStorage.from_buffer(pic.tobytes()) + ) + # PIL image mode: 1, L, P, I, F, RGB, YCbCr, RGBA, CMYK + if pic.mode == "YCbCr": + nchannel = 3 + elif pic.mode == "I;16": + nchannel = 1 + else: + nchannel = len(pic.mode) + img = img.view(pic.size[1], pic.size[0], nchannel) + # put it from HWC to CHW format + # yikes, this transpose takes 80% of the loading time/CPU + img = img.transpose(0, 1).transpose(0, 2).contiguous() + if isinstance(img, torch.ByteTensor): + return img.float().div(255) + else: + return img + + +class Normalize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, image): + image = self.normalize(image, self.mean, self.std) + + return image + + def normalize(self, tensor, mean, std): + for t, m, s in zip(tensor, mean, std): + t.sub_(m).div_(s) + + return tensor + + +def define_model(is_resnet, is_densenet, is_senet): + if is_resnet: + original_model = resnet50(pretrained=False) + Encoder = E_resnet(original_model) + model1 = model( + Encoder, num_features=2048, block_channel=[256, 512, 1024, 2048] + ) + if is_densenet: + # original_model = dendensenet161(pretrained=False) + # Encoder = E_densenet(original_model) + # model1 = model( + # Encoder, num_features=2208, block_channel=[192, 384, 1056, 2208] + # ) + raise NotImplementedError() + if is_senet: + # original_model = senet154(pretrained=False) + # Encoder = E_senet(original_model) + # model1 = model( + # Encoder, num_features=2048, block_channel=[256, 512, 1024, 2048] + # ) + raise NotImplementedError() + return model1 + + +class MonoDepthEstimator: + def __init__(self, checkpoint="./pretrained_model/model_resnet"): + self.model = define_model( + is_resnet=True, is_densenet=False, is_senet=False + ) + self.model = torch.nn.DataParallel(self.model).cuda() + cpt = torch.load(checkpoint) + if "state_dict" in cpt: + cpt = cpt["state_dict"] + self.model.load_state_dict(cpt) + self.model.eval() + self.init_preprocessor() + + def init_preprocessor(self): + __imagenet_stats = { + "mean": [0.485, 0.456, 0.406], + "std": [0.229, 0.224, 0.225], + } + + self.transform = transforms.Compose( + [ + Scale([320, 240]), + # CenterCrop([304, 228]), + ToTensor(), + Normalize(__imagenet_stats["mean"], __imagenet_stats["std"]), + ] + ) + + def preprocess(self, image): + image_torch = self.transform(image).unsqueeze(0) + return image_torch.cuda() + + def compute_depth(self, image): + # Input: image is a PIL image + # Output: depth is a numpy array + image_torch = self.preprocess(image) + # print(image_torch.size()) + depth_torch = self.model(image_torch) + depth = ( + depth_torch.view(depth_torch.size(2), depth_torch.size(3)) + .data.cpu() + .numpy() + ) + return depth diff --git a/habitat-lab-dialog/habitat_baselines/slambased/path_planners.py b/habitat-lab-dialog/habitat_baselines/slambased/path_planners.py new file mode 100644 index 0000000..264d785 --- /dev/null +++ 
b/habitat-lab-dialog/habitat_baselines/slambased/path_planners.py
@@ -0,0 +1,515 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from matplotlib import pyplot as plt
+from torch import nn as nn
+from torch.nn import functional as F
+
+from habitat_baselines.slambased.utils import generate_2dgrid
+
+
+def safe_roi_2d(array2d, ymin, ymax, xmin, xmax):
+    (h, w) = array2d.shape
+    return max(0, ymin), min(ymax, h), max(0, xmin), min(xmax, w)
+
+
+def f2ind(ten, i):
+    # Float to index
+    return torch.round(ten[i]).long()
+
+
+def init_neights_to_channels(ks=3):
+    r"""Convolutional kernel that maps a neighborhood into channels"""
+    weights = np.zeros((ks * ks, 1, ks, ks), dtype=np.float32)
+    for y in range(ks):
+        for x in range(ks):
+            weights[x * ks + y, 0, y, x] = 1.0
+    return weights
+
+
+class SoftArgMin(nn.Module):
+    def __init__(self, beta=5):
+        super(SoftArgMin, self).__init__()
+        self.beta = beta
+        return
+
+    def forward(self, x, coords2d=None):
+        bx_sm = F.softmax(self.beta * (-x).view(1, -1), dim=1)
+        if coords2d is None:
+            coords2d = generate_2dgrid(x.size(2), x.size(3), False)
+        coords2d_flat = coords2d.view(2, -1)
+        return (bx_sm.expand_as(coords2d_flat) * coords2d_flat).sum(
+            dim=1
+        ) / bx_sm.sum(dim=1)
+
+
+class HardArgMin(nn.Module):
+    def __init__(self):
+        super(HardArgMin, self).__init__()
+        return
+
+    def forward(self, x, coords2d=None):
+        val, idx = x.view(-1).min(dim=0)
+        if coords2d is None:
+            coords2d = generate_2dgrid(x.size(2), x.size(3), False)
+        coords2d_flat = coords2d.view(2, -1)
+        return coords2d_flat[:, idx].view(2)
+
+
+class DifferentiableStarPlanner(nn.Module):
+    def __init__(
+        self,
+        max_steps=500,
+        visualize=False,
+        preprocess=False,
+        beta=100,
+        connectivity="eight",
+        device=torch.device("cpu"),  # noqa: B008
+        **kwargs
+    ):
+        super(DifferentiableStarPlanner, self).__init__()
+        self.eps = 1e-12
+        self.max_steps = max_steps
+        self.visualize = visualize
+        self.inf = 1e7
+        self.ob_cost = 10000.0
+        self.device = device
+        self.beta = beta
+        self.preprocess = preprocess
+        # self.argmin = SoftArgMin(beta)
+        self.argmin = HardArgMin()
+        self.neights2channels = nn.Conv2d(1, 9, kernel_size=(3, 3), bias=False)
+        self.neights2channels.weight.data = torch.from_numpy(
+            init_neights_to_channels(3)
+        )
+        self.neights2channels.to(device)
+        self.preprocessNet = nn.Conv2d(
+            1, 1, kernel_size=(3, 3), padding=1, bias=False
+        )
+        self.preprocessNet.weight.data = torch.from_numpy(
+            np.array(
+                [
+                    [
+                        [
+                            [0.00001, 0.0001, 0.00001],
+                            [0.0001, 1, 0.0001],
+                            [0.00001, 0.0001, 0.00001],
+                        ]
+                    ]
+                ],
+                dtype=np.float32,
+            )
+        )
+        self.preprocessNet.to(device)
+        if connectivity == "eight":
+            self.gx_to_right = nn.Conv2d(1, 1, kernel_size=(1, 3), bias=False)
+            self.gx_to_right.weight.data = torch.from_numpy(
+                np.array([[[[0, 1, -1]]]], dtype=np.float32)
+            )
+            self.gx_to_right.to(device)
+
+            self.gx_to_left = nn.Conv2d(1, 1, kernel_size=(1, 3), bias=False)
+            self.gx_to_left.weight.data = torch.from_numpy(
+                np.array([[[[-1, 1, 0]]]], dtype=np.float32)
+            )
+            self.gx_to_left.to(device)
+
+            self.gy_to_up = nn.Conv2d(1, 1, kernel_size=(3, 1), bias=False)
+            self.gy_to_up.weight.data = torch.from_numpy(
+                np.array([[[[0], [1], [-1]]]], dtype=np.float32)
+            )
+            self.gy_to_up.to(device)
+
+            self.gy_to_down = nn.Conv2d(1, 1, kernel_size=(3, 1), bias=False)
+            self.gy_to_down.weight.data =
torch.from_numpy( + np.array([[[[-1], [1], [0]]]], dtype=np.float32) + ) + self.gy_to_down.to(device) + else: + raise ValueError('Only "eight" connectivity now supported') + return + + def preprocess_obstacle_map(self, obstacle_map): + if self.preprocess: + return self.preprocessNet(obstacle_map) + return obstacle_map + + def coords2grid(self, node_coords, h, w): + grid = node_coords.squeeze() - torch.FloatTensor( + (h / 2.0, w / 2.0) + ).to(self.device) + grid = grid / torch.FloatTensor((h / 2.0, w / 2.0)).to(self.device) + return grid.view(1, 1, 1, 2).flip(3) + + def init_closelistmap(self): + return torch.zeros_like(self.start_map).float() + + def init_openlistmap(self): + return self.start_map.clone() + + def init_g_map(self): + return torch.clamp( + self.inf + * (torch.ones_like(self.start_map) - self.start_map.clone()), + min=0, + max=self.inf, + ) + + def safe_roi_2d(self, ymin, ymax, xmin, xmax): + return ( + int(max(0, torch.round(ymin).item())), + int(min(torch.round(ymax).item(), self.height)), + int(max(0, torch.round(xmin).item())), + int(min(torch.round(xmax).item(), self.width)), + ) + + def forward( + self, + obstacles, + coords, + start_map, + goal_map, + non_obstacle_cost_map=None, + additional_steps=50, + return_path=True, + ): + self.trav_init_time = 0 + self.trav_mask_time = 0 + self.trav_soft_time = 0 + self.conv_time = 0 + self.close_time = 0 + + self.obstacles = self.preprocess_obstacle_map( + obstacles.to(self.device) + ) + self.start_map = start_map.to(self.device) + self.been_there = torch.zeros_like(self.start_map).to( + torch.device("cpu") + ) + self.coords = coords.to(self.device) + self.goal_map = goal_map.to(self.device) + self.been_there = torch.zeros_like(self.goal_map).to(self.device) + self.height = obstacles.size(2) + self.width = obstacles.size(3) + m, goal_idx = torch.max(self.goal_map.view(-1), 0) + c_map = self.calculate_local_path_costs(non_obstacle_cost_map) + # c_map might be non persistent in map update + self.g_map = self.init_g_map() + self.close_list_map = self.init_closelistmap() + self.open_list_map = self.init_openlistmap() + not_done = False + step = 0 + stopped_by_max_iter = False + if self.visualize: + self.fig, self.ax = plt.subplots(1, 1) + self.image = self.ax.imshow( + self.g_map.squeeze().cpu().detach().numpy().astype(np.float32), + animated=True, + ) + self.fig.canvas.draw() + not_done = (self.close_list_map.view(-1)[goal_idx].item() < 1.0) or ( + self.g_map.view(-1)[goal_idx].item() >= 0.9 * self.ob_cost + ) + rad = 1 + self.start_coords = ( + (self.coords * self.start_map.expand_as(self.coords)) + .sum(dim=2) + .sum(dim=2) + .squeeze() + ) + node_coords = self.start_coords + self.goal_coords = ( + (self.coords * self.goal_map.expand_as(self.coords)) + .sum(dim=2) + .sum(dim=2) + .squeeze() + ) + self.max_steps = 4 * int( + torch.sqrt( + ((self.start_coords - self.goal_coords) ** 2).sum() + 1e-6 + ).item() + ) + while not_done: + ymin, ymax, xmin, xmax = self.safe_roi_2d( + node_coords[0] - rad, + node_coords[0] + rad + 1, + node_coords[1] - rad, + node_coords[1] + rad + 1, + ) + if ( + (ymin - 1 > 0) + and (xmin - 1 > 0) + and (ymax + 1 < self.height) + and (xmax + 1 < self.width) + ): + n2c = self.neights2channels( + self.g_map[:, :, ymin - 1 : ymax + 1, xmin - 1 : xmax + 1] + ) + self.g_map[:, :, ymin:ymax, xmin:xmax] = torch.min( + self.g_map[:, :, ymin:ymax, xmin:xmax].clone(), + (n2c + c_map[:, :, ymin:ymax, xmin:xmax]).min( + dim=1, keepdim=True + )[0], + ) + self.close_list_map[:, :, ymin:ymax, xmin:xmax] = torch.max( + 
                    self.close_list_map[:, :, ymin:ymax, xmin:xmax],
+                    self.open_list_map[:, :, ymin:ymax, xmin:xmax],
+                )
+                self.open_list_map[:, :, ymin:ymax, xmin:xmax] = F.relu(
+                    F.max_pool2d(
+                        self.open_list_map[
+                            :, :, ymin - 1 : ymax + 1, xmin - 1 : xmax + 1
+                        ],
+                        3,
+                        stride=1,
+                        padding=0,
+                    )
+                    - self.close_list_map[:, :, ymin:ymax, xmin:xmax]
+                    - self.obstacles[:, :, ymin:ymax, xmin:xmax]
+                )
+            else:
+                self.g_map = torch.min(
+                    self.g_map,
+                    (
+                        self.neights2channels(
+                            F.pad(self.g_map, (1, 1, 1, 1), "replicate")
+                        )
+                        + c_map
+                    ).min(dim=1, keepdim=True)[0],
+                )
+                self.close_list_map = torch.max(
+                    self.close_list_map, self.open_list_map
+                )
+                self.open_list_map = F.relu(
+                    F.max_pool2d(self.open_list_map, 3, stride=1, padding=1)
+                    - self.close_list_map
+                    - self.obstacles
+                )
+            step += 1
+            if step >= self.max_steps:
+                stopped_by_max_iter = True
+                break
+            not_done = (
+                self.close_list_map.view(-1)[goal_idx].item() < 1.0
+            ) or (self.g_map.view(-1)[goal_idx].item() >= 0.1 * self.inf)
+            rad += 1
+        if not stopped_by_max_iter:
+            for _ in range(additional_steps):
+                # now propagating beyond the start point
+                self.g_map = torch.min(
+                    self.g_map,
+                    (
+                        self.neights2channels(
+                            F.pad(self.g_map, (1, 1, 1, 1), "replicate")
+                        )
+                        + c_map
+                    ).min(dim=1, keepdim=True)[0],
+                )
+                self.close_list_map = torch.max(
+                    self.close_list_map, self.open_list_map
+                )
+                self.open_list_map = F.relu(
+                    F.max_pool2d(self.open_list_map, 3, stride=1, padding=1)
+                    - self.close_list_map
+                    - self.obstacles
+                )
+        if return_path:
+            out_path, cost = self.reconstruct_path()
+            return out_path, cost
+        return None
+
+    def calculate_local_path_costs(self, non_obstacle_cost_map=None):
+        coords = self.coords
+        h = coords.size(2)
+        w = coords.size(3)
+        obstacles_pd = F.pad(self.obstacles, (1, 1, 1, 1), "replicate")
+        if non_obstacle_cost_map is None:
+            learned_bias = torch.ones_like(self.obstacles).to(
+                obstacles_pd.device
+            )
+        else:
+            learned_bias = non_obstacle_cost_map.to(obstacles_pd.device)
+        left_diff_sq = (
+            self.gx_to_left(
+                F.pad(coords[:, 1:2, :, :], (1, 1, 0, 0), "replicate")
+            )
+            ** 2
+        )
+        right_diff_sq = (
+            self.gx_to_right(
+                F.pad(coords[:, 1:2, :, :], (1, 1, 0, 0), "replicate")
+            )
+            ** 2
+        )
+        up_diff_sq = (
+            self.gy_to_up(
+                F.pad(coords[:, 0:1, :, :], (0, 0, 1, 1), "replicate")
+            )
+            ** 2
+        )
+        down_diff_sq = (
+            self.gy_to_down(
+                F.pad(coords[:, 0:1, :, :], (0, 0, 1, 1), "replicate")
+            )
+            ** 2
+        )
+        out = torch.cat(
+            [
+                # Order is from up to down, from left to right,
+                # hopefully the same as in PyTorch
+                torch.sqrt(left_diff_sq + up_diff_sq + self.eps)
+                + self.ob_cost
+                * torch.max(
+                    obstacles_pd[:, :, 0:h, 0:w],
+                    obstacles_pd[:, :, 1 : h + 1, 1 : w + 1],
+                ),
+                torch.sqrt(left_diff_sq + self.eps)
+                + self.ob_cost
+                * torch.max(
+                    obstacles_pd[:, :, 0:h, 1 : w + 1],
+                    obstacles_pd[:, :, 1 : h + 1, 1 : w + 1],
+                ),
+                torch.sqrt(left_diff_sq + down_diff_sq + self.eps)
+                + self.ob_cost
+                * torch.max(
+                    obstacles_pd[:, :, 2 : h + 2, 0:w],
+                    obstacles_pd[:, :, 1 : h + 1, 1 : w + 1],
+                ),
+                torch.sqrt(up_diff_sq + self.eps)
+                + self.ob_cost
+                * torch.max(
+                    obstacles_pd[:, :, 0:h, 1 : w + 1],
+                    obstacles_pd[:, :, 1 : h + 1, 1 : w + 1],
+                ),
+                0 * right_diff_sq
+                + self.ob_cost
+                * obstacles_pd[:, :, 1 : h + 1, 1 : w + 1],  # current center
+                torch.sqrt(down_diff_sq + self.eps)
+                + self.ob_cost
+                * torch.max(
+                    obstacles_pd[:, :, 2 : h + 2, 1 : w + 1],
+                    obstacles_pd[:, :, 1 : h + 1, 1 : w + 1],
+                ),
+                torch.sqrt(right_diff_sq + up_diff_sq + self.eps)
+                + self.ob_cost
+                * torch.max(
+                    obstacles_pd[:, :, 0:h, 2 : w +
2], + obstacles_pd[:, :, 1 : h + 1, 1 : w + 1], + ), + torch.sqrt(right_diff_sq + self.eps) + + self.ob_cost + * torch.max( + obstacles_pd[:, :, 1 : h + 1, 2 : w + 2], + obstacles_pd[:, :, 1 : h + 1, 1 : w + 1], + ), + torch.sqrt(right_diff_sq + down_diff_sq + self.eps) + + self.ob_cost + * torch.max( + obstacles_pd[:, :, 2 : h + 2, 2 : w + 2], + obstacles_pd[:, :, 1 : h + 1, 1 : w + 1], + ), + ], + dim=1, + ) + return out + torch.clamp( + learned_bias.expand_as(out), min=0, max=self.ob_cost + ) + + def propagate_traversal(self, node_coords, close, g, coords): + ymin, ymax, xmin, xmax = self.safe_roi_2d( + node_coords[0] - 1, + node_coords[0] + 2, + node_coords[1] - 1, + node_coords[1] + 2, + ) + mask = close[:, :, ymin:ymax, xmin:xmax] > 0 + mask[ + :, :, f2ind(node_coords, 0) - ymin, f2ind(node_coords, 1) - xmin + ] = 0 + mask = mask > 0 + current_g_cost = g[:, :, ymin:ymax, xmin:xmax][mask].clone() + if len(current_g_cost.view(-1)) == 0: + # we are kind surrounded by obstacles, + # but still need to output something + mask = torch.relu( + 1.0 - self.been_there[:, :, ymin:ymax, xmin:xmax] + ) + mask[ + :, + :, + f2ind(node_coords, 0) - ymin, + f2ind(node_coords, 1) - xmin, + ] = 0 + mask = mask > 0 + current_g_cost = g[:, :, ymin:ymax, xmin:xmax][mask].clone() + if len(current_g_cost.view(-1)) > 1: + current_g_cost = current_g_cost - torch.min(current_g_cost).item() + current_g_cost = ( + current_g_cost + + 0.41 + * torch.randperm( + len(current_g_cost), + dtype=torch.float32, + device=torch.device("cpu"), + ) + / (len(current_g_cost)) + ) + # + coords_roi = coords[:, :, ymin:ymax, xmin:xmax] + out = self.argmin( + current_g_cost, coords_roi[mask.expand_as(coords_roi)] + ) + return out + + def get_clean_costmap_and_goodmask(self): + good_mask = 1 - F.max_pool2d(self.obstacles, 3, stride=1, padding=1) + costmap = self.g_map + obstacle_cost_corrected = 10000.0 + sampling_map = torch.clamp(costmap, min=0, max=obstacle_cost_corrected) + return sampling_map, good_mask + + def reconstruct_path(self): + out_path = [] + goal_coords = self.goal_coords.cpu() + start_coords = self.start_coords.cpu() + + cost = self.g_map[:, :, f2ind(goal_coords, 0), f2ind(goal_coords, 1)] + # Traversing + done = False + node_coords = goal_coords.cpu() + out_path.append(node_coords) + self.been_there = 0 * self.been_there.cpu() + self.been_there[ + :, :, f2ind(node_coords, 0), f2ind(node_coords, 1) + ] = 1.0 + self.close_list_map = self.close_list_map.cpu() + self.g_map = self.g_map.cpu() + self.coords = self.coords.cpu() + count1 = 0 + while not done: + node_coords = self.propagate_traversal( + node_coords, self.close_list_map, self.g_map, self.coords + ) + self.been_there[ + :, :, f2ind(node_coords, 0), f2ind(node_coords, 1) + ] = 1.0 + if torch.norm(node_coords - out_path[-1], 2).item() < 0.3: + y = node_coords.flatten()[0].long() + x = node_coords.flatten()[1].long() + print(self.g_map[0, 0, y - 2 : y + 3, x - 2 : x + 3]) + print("loop in out_path", node_coords) + raise ValueError("loop in out_path") + return out_path, cost + out_path.append(node_coords) + done = torch.norm(node_coords - start_coords.cpu(), 2).item() < 0.3 + count1 += 1 + if count1 > 250: + break + return out_path, cost diff --git a/habitat-lab-dialog/habitat_baselines/slambased/reprojection.py b/habitat-lab-dialog/habitat_baselines/slambased/reprojection.py new file mode 100644 index 0000000..dd90721 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/slambased/reprojection.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 + +# Copyright (c) 
Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from math import ceil, floor + +import numpy as np +import torch + + +def p_zx(p): + return p[(0, 2), 3] + + +def get_map_size_in_cells(map_size_in_meters, cell_size_in_meters): + return int(ceil(map_size_in_meters / cell_size_in_meters)) + 1 + + +def get_pos_diff(p_init, p_fin): + return p_zx(p_fin) - p_zx(p_init) + + +def get_distance(p_init, p_fin): + return torch.norm(get_pos_diff(p_init, p_fin)) + + +def get_pos_diffs(ps): + return ps[1:, (0, 2), 3] - ps[: (ps.size(0) - 1), (0, 2), 3] + + +def angle_to_pi_2_minus_pi_2(angle): + if angle < -np.pi: + angle = 2.0 * np.pi + angle + if angle > np.pi: + angle = -2.0 * np.pi + angle + return angle + + +def get_direction(p_init, p_fin, ang_th=0.2, pos_th=0.1): + pos_diff = get_pos_diff(p_init, p_fin) + if torch.norm(pos_diff, 2).item() < pos_th: + return 0 + else: + needed_angle = torch.atan2(pos_diff[1], pos_diff[0]) + current_angle = torch.atan2(p_init[2, 0], p_init[0, 0]) + to_rotate = angle_to_pi_2_minus_pi_2( + -np.pi / 2.0 + needed_angle - current_angle + ) + if torch.abs(to_rotate).item() < ang_th: + return 0 + return to_rotate + + +def reproject_local_to_global(xyz_local, p): + device = xyz_local.device + num, dim = xyz_local.size() + if dim == 3: + xyz = torch.cat( + [ + xyz_local, + torch.ones((num, 1), dtype=torch.float32, device=device), + ], + dim=1, + ) + elif dim == 4: + xyz = xyz_local + else: + raise ValueError( + "3d point cloud dim is neighter 3, or 4 (homogenious)" + ) + # print(xyz.shape, P.shape) + xyz_global = torch.mm(p.squeeze(), xyz.t()) + return xyz_global.t() + + +def project2d_pcl_into_worldmap(zx, map_size, cell_size): + device = zx.device + shift = int(floor(get_map_size_in_cells(map_size, cell_size) / 2.0)) + topdown2index = torch.tensor( + [[1.0 / cell_size, 0, shift], [0, 1.0 / cell_size, shift], [0, 0, 1]], + device=device, + ) + world_coords_h = torch.cat( + [zx.view(-1, 2), torch.ones((len(zx), 1), device=device)], dim=1 + ) + world_coords = torch.mm(topdown2index, world_coords_h.t()) + return world_coords.t()[:, :2] + + +def get_pose2d(poses6d): + poses6d = poses6d.view(-1, 4, 4) + poses2d = poses6d[:, (0, 2)] + poses2d = poses2d[:, :, (0, 2, 3)] + return poses2d + + +def get_rotation_matrix(angle_in_radians): + angle_in_radians = angle_in_radians.view(-1, 1, 1) + sin_a = torch.sin(angle_in_radians) + cos_a = torch.cos(angle_in_radians) + a1x = torch.cat([cos_a, sin_a], dim=2) + a2x = torch.cat([-sin_a, cos_a], dim=2) + transform = torch.cat([a1x, a2x], dim=1) + return transform + + +def normalize_zx_ori(p): + p2d = get_pose2d(p) + norms = torch.norm(p2d[:, 0, :2], dim=1).view(-1, 1, 1) + out = torch.cat( + [ + torch.cat( + [p[:, :3, :3] / norms.expand(p.size(0), 3, 3), p[:, 3:, :3]], + dim=1, + ), + p[:, :, 3:], + ], + dim=2, + ) + return out + + +def add_rot_wps(p): + planned_tps_norm = normalize_zx_ori(p) + pos_diffs = get_pos_diffs(planned_tps_norm) + + angles = torch.atan2(pos_diffs[:, 1], pos_diffs[:, 0]) + rotmats = get_rotation_matrix(angles) + planned_tps_norm[: p.size(0) - 1, 0, 0] = rotmats[:, 0, 0] + planned_tps_norm[: p.size(0) - 1, 0, 2] = rotmats[:, 0, 1] + planned_tps_norm[: p.size(0) - 1, 2, 0] = rotmats[:, 1, 0] + planned_tps_norm[: p.size(0) - 1, 2, 2] = rotmats[:, 1, 1] + + planned_points2 = planned_tps_norm.clone() + + planned_points2[1:, 0, 0] = planned_tps_norm[: p.size(0) - 1, 0, 0] + planned_points2[1:, 0, 2] = 
planned_tps_norm[: p.size(0) - 1, 0, 2] + planned_points2[1:, 2, 0] = planned_tps_norm[: p.size(0) - 1, 2, 0] + planned_points2[1:, 2, 2] = planned_tps_norm[: p.size(0) - 1, 2, 2] + out = torch.stack( + (planned_points2.unsqueeze(0), planned_tps_norm.unsqueeze(0)), dim=0 + ).squeeze() + out = out.permute(1, 0, 2, 3).contiguous().view(-1, 4, 4) + return out + + +def planned_path2tps(path, cell_size, map_size, agent_h, add_rot=False): + r"""Path is list of 2d coordinates from planner, in map cells. + tp is trajectory pose, 4x4 matrix - same format, + as in localization module + """ + path = torch.cat(path).view(-1, 2) + # print(path.size()) + num_pts = len(path) + planned_tps = torch.eye(4).unsqueeze(0).repeat((num_pts, 1, 1)) + planned_tps[:, 0, 3] = path[:, 1] # switch back x and z + planned_tps[:, 1, 3] = agent_h + planned_tps[:, 2, 3] = path[:, 0] # switch back x and z + shift = int(floor(get_map_size_in_cells(map_size, cell_size) / 2.0)) + planned_tps[:, 0, 3] = planned_tps[:, 0, 3] - shift + planned_tps[:, 2, 3] = planned_tps[:, 2, 3] - shift + p = torch.tensor( + [ + [1.0 / cell_size, 0, 0, 0], + [0, 1.0 / cell_size, 0, 0], + [0, 0, 1.0 / cell_size, 0], + [0, 0, 0, 1], + ] + ) + planned_tps = torch.bmm( + p.inverse().unsqueeze(0).expand(num_pts, 4, 4), planned_tps + ) + if add_rot: + return add_rot_wps(planned_tps) + return planned_tps + + +def habitat_goalpos_to_tp(ro_phi, p_curr): + r"""Convert distance and azimuth to + trajectory pose, 4x4 matrix - same format, + as in localization module + """ + device = ro_phi.device + offset = torch.tensor( + [ + -ro_phi[0] * torch.sin(ro_phi[1]), + 0, + ro_phi[0] * torch.cos(ro_phi[1]), + ] + ).to(device) + if p_curr.size(1) == 3: + p_curr = homogenize_p(p_curr) + goal_tp = torch.mm( + p_curr.to(device), + torch.cat( + [ + offset + * torch.tensor( + [1.0, 1.0, 1.0], dtype=torch.float32, device=device + ), + torch.tensor([1.0], device=device), + ] + ).reshape(4, 1), + ) + return goal_tp + + +def habitat_goalpos_to_mapgoal_pos(offset, p_curr, cell_size, map_size): + r"""Convert distance and azimuth to + map cell coordinates + """ + device = offset.device + goal_tp = habitat_goalpos_to_tp(offset, p_curr) + goal_tp1 = torch.eye(4).to(device) + goal_tp1[:, 3:] = goal_tp + projected_p = project_tps_into_worldmap( + goal_tp1.view(1, 4, 4), cell_size, map_size + ) + return projected_p + + +def homogenize_p(tps): + device = tps.device + tps = tps.view(-1, 3, 4) + return torch.cat( + [ + tps.float(), + torch.tensor([0, 0, 0, 1.0]) + .view(1, 1, 4) + .expand(tps.size(0), 1, 4) + .to(device), + ], + dim=1, + ) + + +def project_tps_into_worldmap(tps, cell_size, map_size, do_floor=True): + r"""Convert 4x4 pose matrices (trajectory poses) to + map cell coordinates + """ + if len(tps) == 0: + return [] + if isinstance(tps, list): + return [] + device = tps.device + topdown_p = torch.tensor([[1.0, 0, 0, 0], [0, 0, 1.0, 0]]).to(device) + world_coords = torch.bmm( + topdown_p.view(1, 2, 4).expand(tps.size(0), 2, 4), + tps[:, :, 3:].view(-1, 4, 1), + ) + shift = int(floor(get_map_size_in_cells(map_size, cell_size) / 2.0)) + topdown2index = torch.tensor( + [[1.0 / cell_size, 0, shift], [0, 1.0 / cell_size, shift], [0, 0, 1]] + ).to(device) + world_coords_h = torch.cat( + [world_coords, torch.ones((len(world_coords), 1, 1)).to(device)], dim=1 + ) + world_coords = torch.bmm( + topdown2index.unsqueeze(0).expand(world_coords_h.size(0), 3, 3), + world_coords_h, + )[:, :2, 0] + if do_floor: + return ( + torch.floor(world_coords.flip(1)) + 1 + ) # for having revesrve 
(z,x) ordering + return world_coords.flip(1) + + +def project_tps_into_worldmap_numpy(tps, slam_to_world, cell_size, map_size): + if len(tps) == 0: + return [] + if isinstance(tps, list): + return [] + # tps is expected in [n,4,4] format + topdown_p = np.array([[slam_to_world, 0, 0, 0], [0, 0, slam_to_world, 0]]) + try: + world_coords = np.matmul( + topdown_p.reshape(1, 2, 4), tps[:, :, 3:].reshape(-1, 4, 1) + ) + except BaseException: + return [] + shift = int(floor(get_map_size_in_cells(map_size, cell_size) / 2.0)) + topdown2index = np.array( + [[1.0 / cell_size, 0, shift], [0, 1.0 / cell_size, shift], [0, 0, 1]] + ) + world_coords_h = np.concatenate( + [world_coords, np.ones((len(world_coords), 1, 1))], axis=1 + ) + world_coords = np.matmul(topdown2index, world_coords_h)[:, :2, 0] + return ( + world_coords[:, ::-1].astype(np.int32) + 1 + ) # for having revesrve (z,x) ordering diff --git a/habitat-lab-dialog/habitat_baselines/slambased/requirements.txt b/habitat-lab-dialog/habitat_baselines/slambased/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/habitat-lab-dialog/habitat_baselines/slambased/utils.py b/habitat-lab-dialog/habitat_baselines/slambased/utils.py new file mode 100644 index 0000000..d81341d --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/slambased/utils.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import time + +import numpy as np +import torch +from PIL import Image + + +def generate_2dgrid(h, w, centered=False): + if centered: + x = torch.linspace(-w / 2 + 1, w / 2, w) + y = torch.linspace(-h / 2 + 1, h / 2, h) + else: + x = torch.linspace(0, w - 1, w) + y = torch.linspace(0, h - 1, h) + grid2d = torch.stack( + [y.repeat(w, 1).t().contiguous().view(-1), x.repeat(h)], 1 + ) + return grid2d.view(1, h, w, 2).permute(0, 3, 1, 2) + + +def str2bool(v): + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise ValueError(f"{v} cannot be converted to a bool") + + +def resize_pil(np_img, size=128): + im1 = Image.fromarray(np_img) + im1.thumbnail((size, size)) + return np.array(im1) + + +def find_map_size(h, w): + map_size_in_meters = int(0.1 * 3 * max(h, w)) + if map_size_in_meters % 10 != 0: + map_size_in_meters = map_size_in_meters + ( + 10 - (map_size_in_meters % 10) + ) + return map_size_in_meters + + +def gettimestr(): + return time.strftime("%Y-%m-%d--%H_%M_%S", time.gmtime()) diff --git a/habitat-lab-dialog/habitat_baselines/utils/__init__.py b/habitat-lab-dialog/habitat_baselines/utils/__init__.py new file mode 100644 index 0000000..e44b020 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/utils/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +__all__ = ["visualizations", "env_utils", "common"] diff --git a/habitat-lab-dialog/habitat_baselines/utils/common.py b/habitat-lab-dialog/habitat_baselines/utils/common.py new file mode 100644 index 0000000..4d0d639 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/utils/common.py @@ -0,0 +1,415 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. 
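Before moving into the shared utils, a quick numeric sanity check of the world-to-map-cell transform that recurs through reprojection.py above (get_map_size_in_cells plus the `shift` baked into topdown2index). This is a minimal sketch; the cell_size and map_size values are invented for illustration.

```python
import numpy as np

# Mirrors get_map_size_in_cells and the `shift` used by topdown2index above;
# cell_size and map_size are made-up example values.
cell_size, map_size = 0.1, 40.0
map_cells = int(np.ceil(map_size / cell_size)) + 1  # 401 cells per side
shift = int(np.floor(map_cells / 2.0))              # center cell index: 200
# A (z, x) point at the world origin should land in the center cell:
zx = np.array([0.0, 0.0])
cell = zx / cell_size + shift
assert cell.tolist() == [200.0, 200.0]
```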
+# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import glob +import os +import re +import shutil +import tarfile +from collections import defaultdict +from io import BytesIO +from typing import ( + Any, + DefaultDict, + Dict, + Iterable, + List, + Optional, + Tuple, + Union, +) + +import numpy as np +import torch +from gym.spaces import Box +from PIL import Image +from torch import Size, Tensor +from torch import nn as nn + +from habitat import logger +from habitat.core.dataset import Episode +from habitat.core.utils import try_cv2_import +from habitat.utils import profiling_wrapper +from habitat.utils.visualizations.utils import images_to_video +from habitat_baselines.common.tensor_dict import DictTree, TensorDict +from habitat_baselines.common.tensorboard_utils import TensorboardWriter + +cv2 = try_cv2_import() + + +class CustomFixedCategorical(torch.distributions.Categorical): # type: ignore + def sample( + self, sample_shape: Size = torch.Size() # noqa: B008 + ) -> Tensor: + return super().sample(sample_shape).unsqueeze(-1) + + def log_probs(self, actions: Tensor) -> Tensor: + return ( + super() + .log_prob(actions.squeeze(-1)) + .view(actions.size(0), -1) + .sum(-1) + .unsqueeze(-1) + ) + + def mode(self): + return self.probs.argmax(dim=-1, keepdim=True) + + +class CategoricalNet(nn.Module): + def __init__(self, num_inputs: int, num_outputs: int) -> None: + super().__init__() + + self.linear = nn.Linear(num_inputs, num_outputs) + + nn.init.orthogonal_(self.linear.weight, gain=0.01) + nn.init.constant_(self.linear.bias, 0) + + def forward(self, x: Tensor) -> CustomFixedCategorical: + x = self.linear(x) + return CustomFixedCategorical(logits=x) + + +def linear_decay(epoch: int, total_num_updates: int) -> float: + r"""Returns a multiplicative factor for linear value decay + + Args: + epoch: current epoch number + total_num_updates: total number of + + Returns: + multiplicative factor that decreases param value linearly + """ + return 1 - (epoch / float(total_num_updates)) + + +@torch.no_grad() +@profiling_wrapper.RangeContext("batch_obs") +def batch_obs( + observations: List[DictTree], + device: Optional[torch.device] = None, +) -> TensorDict: + r"""Transpose a batch of observation dicts to a dict of batched + observations. + + Args: + observations: list of dicts of observations. + device: The torch.device to put the resulting tensors on. + Will not move the tensors if None + + Returns: + transposed dict of torch.Tensor of observations. + """ + batch: DefaultDict[str, List] = defaultdict(list) + + for obs in observations: + for sensor in obs: + batch[sensor].append(torch.as_tensor(obs[sensor])) + + batch_t: TensorDict = TensorDict() + + for sensor in batch: + batch_t[sensor] = torch.stack(batch[sensor], dim=0) + + return batch_t.map(lambda v: v.to(device)) + + +def get_checkpoint_id(ckpt_path: str) -> Optional[int]: + r"""Attempts to extract the ckpt_id from the filename of a checkpoint. + Assumes structure of ckpt.ID.path . 
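A couple of hypothetical filenames make the assumed `ckpt.ID.path` convention concrete (paths invented for illustration):

```python
# The parser keeps the last all-digit dot-separated segment of the basename.
assert get_checkpoint_id("/checkpoints/ckpt.12.pth") == 12
assert get_checkpoint_id("ckpt.best.pth") is None  # no numeric segment
```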
+ + Args: + ckpt_path: the path to the ckpt file + + Returns: + returns an int if it is able to extract the ckpt_path else None + """ + ckpt_path = os.path.basename(ckpt_path) + nums: List[int] = [int(s) for s in ckpt_path.split(".") if s.isdigit()] + if len(nums) > 0: + return nums[-1] + return None + + +def poll_checkpoint_folder( + checkpoint_folder: str, previous_ckpt_ind: int +) -> Optional[str]: + r"""Return (previous_ckpt_ind + 1)th checkpoint in checkpoint folder + (sorted by time of last modification). + + Args: + checkpoint_folder: directory to look for checkpoints. + previous_ckpt_ind: index of checkpoint last returned. + + Returns: + return checkpoint path if (previous_ckpt_ind + 1)th checkpoint is found + else return None. + """ + assert os.path.isdir(checkpoint_folder), ( + f"invalid checkpoint folder " f"path {checkpoint_folder}" + ) + models_paths = list( + filter(os.path.isfile, glob.glob(checkpoint_folder + "/*")) + ) + models_paths.sort(key=os.path.getmtime) + ind = previous_ckpt_ind + 1 + if ind < len(models_paths): + return models_paths[ind] + return None + + +def generate_video( + video_option: List[str], + video_dir: Optional[str], + images: List[np.ndarray], + episode_id: Union[int, str], + checkpoint_idx: int, + metrics: Dict[str, float], + tb_writer: TensorboardWriter, + fps: int = 10, +) -> None: + r"""Generate video according to specified information. + + Args: + video_option: string list of "tensorboard" or "disk" or both. + video_dir: path to target video directory. + images: list of images to be converted to video. + episode_id: episode id for video naming. + checkpoint_idx: checkpoint index for video naming. + metric_name: name of the performance metric, e.g. "spl". + metric_value: value of metric. + tb_writer: tensorboard writer object for uploading video. + fps: fps for generated video. + Returns: + None + """ + if len(images) < 1: + return + + metric_strs = [] + for k, v in metrics.items(): + metric_strs.append(f"{k}={v:.2f}") + + video_name = f"episode={episode_id}-ckpt={checkpoint_idx}-" + "-".join( + metric_strs + ) + if "disk" in video_option: + assert video_dir is not None + images_to_video(images, video_dir, video_name) + if "tensorboard" in video_option: + tb_writer.add_video_from_np_images( + f"episode{episode_id}", checkpoint_idx, images, fps=fps + ) + + +def tensor_to_depth_images(tensor: Union[torch.Tensor, List]) -> np.ndarray: + r"""Converts tensor (or list) of n image tensors to list of n images. + Args: + tensor: tensor containing n image tensors + Returns: + list of images + """ + images = [] + + for img_tensor in tensor: + image = img_tensor.permute(1, 2, 0).cpu().numpy() * 255 + images.append(image) + + return images + + +def tensor_to_bgr_images( + tensor: Union[torch.Tensor, Iterable[torch.Tensor]] +) -> List[np.ndarray]: + r"""Converts tensor of n image tensors to list of n BGR images. + Args: + tensor: tensor containing n image tensors + Returns: + list of images + """ + images = [] + + for img_tensor in tensor: + img = img_tensor.permute(1, 2, 0).cpu().numpy() * 255 + img = img.astype(np.uint8) + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + images.append(img) + + return images + + +def image_resize_shortest_edge( + img: Tensor, size: int, channels_last: bool = False +) -> torch.Tensor: + """Resizes an img so that the shortest side is length of size while + preserving aspect ratio. 
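Stepping back to generate_video above: the output filename is assembled from the metrics dict, which is easiest to see with made-up numbers.

```python
# Filename composition as in generate_video, with invented metric values.
metrics = {"spl": 0.83, "success": 1.0}
metric_strs = [f"{k}={v:.2f}" for k, v in metrics.items()]
video_name = f"episode={7}-ckpt={3}-" + "-".join(metric_strs)
assert video_name == "episode=7-ckpt=3-spl=0.83-success=1.00"
```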
+
+    Args:
+        img: the array object that needs to be resized (HWC) or (NHWC)
+        size: the size that you want the shortest edge to be resized to
+        channels_last: a boolean indicating whether the channel dimension is last
+    Returns:
+        The resized array as a torch tensor.
+    """
+    img = torch.as_tensor(img)
+    no_batch_dim = len(img.shape) == 3
+    if len(img.shape) < 3 or len(img.shape) > 5:
+        raise NotImplementedError()
+    if no_batch_dim:
+        img = img.unsqueeze(0)  # Adds a batch dimension
+    h, w = get_image_height_width(img, channels_last=channels_last)
+    if channels_last:
+        if len(img.shape) == 4:
+            # NHWC -> NCHW
+            img = img.permute(0, 3, 1, 2)
+        else:
+            # NDHWC -> NDCHW
+            img = img.permute(0, 1, 4, 2, 3)
+
+    # Percentage resize
+    scale = size / min(h, w)
+    h = int(h * scale)
+    w = int(w * scale)
+    img = torch.nn.functional.interpolate(
+        img.float(), size=(h, w), mode="area"
+    ).to(dtype=img.dtype)
+    if channels_last:
+        if len(img.shape) == 4:
+            # NCHW -> NHWC
+            img = img.permute(0, 2, 3, 1)
+        else:
+            # NDCHW -> NDHWC
+            img = img.permute(0, 1, 3, 4, 2)
+    if no_batch_dim:
+        img = img.squeeze(dim=0)  # Removes the batch dimension
+    return img
+
+
+def center_crop(
+    img: Tensor, size: Union[int, Tuple[int, int]], channels_last: bool = False
+) -> Tensor:
+    """Performs a center crop on an image.
+
+    Args:
+        img: the array object that needs to be cropped (either batched or unbatched)
+        size: a sequence (h, w) or an int giving the desired crop size
+        channels_last: If the channels are the last dimension.
+    Returns:
+        the cropped array
+    """
+    h, w = get_image_height_width(img, channels_last=channels_last)
+
+    if isinstance(size, int):
+        size_tuple: Tuple[int, int] = (int(size), int(size))
+    else:
+        size_tuple = size
+    assert len(size_tuple) == 2, "size should be the (h, w) you wish to crop to"
+    cropy, cropx = size_tuple
+
+    startx = w // 2 - (cropx // 2)
+    starty = h // 2 - (cropy // 2)
+    if channels_last:
+        return img[..., starty : starty + cropy, startx : startx + cropx, :]
+    else:
+        return img[..., starty : starty + cropy, startx : startx + cropx]
+
+
+def get_image_height_width(
+    img: Union[Box, np.ndarray, torch.Tensor], channels_last: bool = False
+) -> Tuple[int, int]:
+    if img.shape is None or len(img.shape) < 3 or len(img.shape) > 5:
+        raise NotImplementedError()
+    if channels_last:
+        # NHWC
+        h, w = img.shape[-3:-1]
+    else:
+        # NCHW
+        h, w = img.shape[-2:]
+    return h, w
+
+
+def overwrite_gym_box_shape(box: Box, shape) -> Box:
+    if box.shape == shape:
+        return box
+    shape = list(shape) + list(box.shape[len(shape) :])
+    low = box.low if np.isscalar(box.low) else np.min(box.low)
+    high = box.high if np.isscalar(box.high) else np.max(box.high)
+    return Box(low=low, high=high, shape=shape, dtype=box.dtype)
+
+
+def get_scene_episode_dict(episodes: List[Episode]) -> Dict:
+    scene_ids = []
+    scene_episode_dict = {}
+
+    for episode in episodes:
+        if episode.scene_id not in scene_ids:
+            scene_ids.append(episode.scene_id)
+            scene_episode_dict[episode.scene_id] = [episode]
+        else:
+            scene_episode_dict[episode.scene_id].append(episode)
+
+    return scene_episode_dict
+
+
+def base_plus_ext(path: str) -> Union[Tuple[str, str], Tuple[None, None]]:
+    """Helper method that splits off all extensions.
+    Returns base, allext.
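A minimal shape-level sanity check of the two image helpers above, assuming NCHW inputs (channels_last=False); the sizes are made up.

```python
import torch

img = torch.zeros(1, 3, 240, 320)               # N, C, H, W
out = image_resize_shortest_edge(img, 120)      # scale = 120 / min(240, 320)
assert out.shape == (1, 3, 120, 160)            # aspect ratio preserved
crop = center_crop(img, (100, 100))
assert crop.shape == (1, 3, 100, 100)           # cropped about the center
```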
+    path: path with extensions
+    returns: path with all extensions removed
+    """
+    match = re.match(r"^((?:.*/|)[^.]+)[.]([^/]*)$", path)
+    if not match:
+        return None, None
+    return match.group(1), match.group(2)
+
+
+def valid_sample(sample: Optional[Any]) -> bool:
+    """Check whether a webdataset sample is valid.
+    sample: sample to be checked
+    """
+    return (
+        sample is not None
+        and isinstance(sample, dict)
+        and len(list(sample.keys())) > 0
+        and not sample.get("__bad__", False)
+    )
+
+
+def img_bytes_2_np_array(
+    x: Tuple[int, torch.Tensor, bytes]
+) -> Tuple[int, torch.Tensor, bytes, np.ndarray]:
+    """Mapper function to convert image bytes in a webdataset sample to numpy
+    arrays.
+    Args:
+        x: webdataset sample containing ep_id, question, answer and imgs
+    Returns:
+        Same sample with bytes turned into np arrays.
+    """
+    images = []
+    img_bytes: bytes
+    for img_bytes in x[3:]:
+        bytes_obj = BytesIO()
+        bytes_obj.write(img_bytes)
+        image = np.array(Image.open(bytes_obj))
+        img = image.transpose(2, 0, 1)
+        img = img / 255.0
+        images.append(img)
+    return (*x[0:3], np.array(images, dtype=np.float32))
+
+
+def create_tar_archive(archive_path: str, dataset_path: str) -> None:
+    """Creates a gzipped tar archive of the dataset.
+    Used in VQA trainer's webdataset.
+    """
+    logger.info("[ Creating tar archive. This will take a few minutes. ]")
+
+    with tarfile.open(archive_path, "w:gz") as tar:
+        for file in sorted(os.listdir(dataset_path)):
+            tar.add(os.path.join(dataset_path, file))
+
+
+def delete_folder(path: str) -> None:
+    shutil.rmtree(path)
diff --git a/habitat-lab-dialog/habitat_baselines/utils/env_utils.py b/habitat-lab-dialog/habitat_baselines/utils/env_utils.py
new file mode 100644
index 0000000..d0dbd25
--- /dev/null
+++ b/habitat-lab-dialog/habitat_baselines/utils/env_utils.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import random
+from typing import List, Type, Union
+
+import habitat
+from habitat import Config, Env, RLEnv, VectorEnv, make_dataset
+
+
+def make_env_fn(
+    config: Config, env_class: Union[Type[Env], Type[RLEnv]]
+) -> Union[Env, RLEnv]:
+    r"""Creates an env of type env_class with the specified config.
+    This is to be passed in as an argument when creating VectorEnv.
+
+    Args:
+        config: root exp config that has core env config node as well as
+            env-specific config node.
+        env_class: class type of the env to be created.
+
+    Returns:
+        env object created according to specification.
+    """
+    dataset = make_dataset(
+        config.TASK_CONFIG.DATASET.TYPE, config=config.TASK_CONFIG.DATASET
+    )
+    env = env_class(config=config, dataset=dataset)
+    env.seed(config.TASK_CONFIG.SEED)
+    return env
+
+
+def construct_envs(
+    config: Config,
+    env_class: Union[Type[Env], Type[RLEnv]],
+    workers_ignore_signals: bool = False,
+) -> VectorEnv:
+    r"""Create VectorEnv object with specified config and env class type.
+    To allow better performance, the dataset is split into smaller subsets,
+    one for each individual env, grouped by scenes.
+
+    :param config: configs that contain num_environments as well as information
+        necessary to create individual environments.
+    :param env_class: class type of the envs to be created.
+    :param workers_ignore_signals: Passed to :ref:`habitat.VectorEnv`'s constructor
+
+    :return: VectorEnv object created according to specification.
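The "grouped by scenes" behavior just described reduces how many scenes each worker has to load; the split itself is a simple round-robin, sketched standalone here with invented scene names (the real function shuffles the list first):

```python
scenes = ["sceneA", "sceneB", "sceneC", "sceneD", "sceneE"]  # made-up names
num_environments = 2
scene_splits = [[] for _ in range(num_environments)]
for idx, scene in enumerate(scenes):
    scene_splits[idx % len(scene_splits)].append(scene)
assert scene_splits == [["sceneA", "sceneC", "sceneE"], ["sceneB", "sceneD"]]
assert sum(map(len, scene_splits)) == len(scenes)  # nothing dropped
```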
+ """ + + num_environments = config.NUM_ENVIRONMENTS + configs = [] + env_classes = [env_class for _ in range(num_environments)] + dataset = make_dataset(config.TASK_CONFIG.DATASET.TYPE) + scenes = config.TASK_CONFIG.DATASET.CONTENT_SCENES + if "*" in config.TASK_CONFIG.DATASET.CONTENT_SCENES: + scenes = dataset.get_scenes_to_load(config.TASK_CONFIG.DATASET) + + if num_environments > 1: + if len(scenes) == 0: + raise RuntimeError( + "No scenes to load, multiple process logic relies on being able to split scenes uniquely between processes" + ) + + if len(scenes) < num_environments: + raise RuntimeError( + "reduce the number of environments as there " + "aren't enough number of scenes.\n" + "num_environments: {}\tnum_scenes: {}".format( + num_environments, len(scenes) + ) + ) + + random.shuffle(scenes) + + scene_splits: List[List[str]] = [[] for _ in range(num_environments)] + for idx, scene in enumerate(scenes): + scene_splits[idx % len(scene_splits)].append(scene) + + assert sum(map(len, scene_splits)) == len(scenes) + + for i in range(num_environments): + proc_config = config.clone() + proc_config.defrost() + + task_config = proc_config.TASK_CONFIG + task_config.SEED = task_config.SEED + i + if len(scenes) > 0: + task_config.DATASET.CONTENT_SCENES = scene_splits[i] + + task_config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = ( + config.SIMULATOR_GPU_ID + ) + + task_config.SIMULATOR.AGENT_0.SENSORS = config.SENSORS + + proc_config.freeze() + configs.append(proc_config) + + envs = habitat.VectorEnv( + make_env_fn=make_env_fn, + env_fn_args=tuple(zip(configs, env_classes)), + workers_ignore_signals=workers_ignore_signals, + ) + return envs diff --git a/habitat-lab-dialog/habitat_baselines/utils/visualizations/__init__.py b/habitat-lab-dialog/habitat_baselines/utils/visualizations/__init__.py new file mode 100644 index 0000000..19360f7 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/utils/visualizations/__init__.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from habitat_baselines.utils.visualizations import utils + +__all__ = ["utils"] diff --git a/habitat-lab-dialog/habitat_baselines/utils/visualizations/utils.py b/habitat-lab-dialog/habitat_baselines/utils/visualizations/utils.py new file mode 100644 index 0000000..5bb7de9 --- /dev/null +++ b/habitat-lab-dialog/habitat_baselines/utils/visualizations/utils.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + + +import numpy as np +import torch + +from habitat.core.utils import try_cv2_import +from habitat_baselines.utils.common import ( + tensor_to_bgr_images, + tensor_to_depth_images, +) + +try: + from habitat_sim.utils.common import d3_40_colors_rgb +except ImportError: + d3_40_colors_rgb = None + +cv2 = try_cv2_import() + + +def save_rgb_results( + gt_rgb: torch.Tensor, pred_rgb: torch.Tensor, path: str +) -> None: + r"""For saving RGB reconstruction results during EQA-CNN-Pretrain eval. 
+
+    Args:
+        gt_rgb: RGB ground truth tensor
+        pred_rgb: RGB reconstruction tensor
+        path: to save images
+    """
+    path = path.format(split="val", type="rgb")
+    gt_bgr_o, pred_bgr = tensor_to_bgr_images([gt_rgb, pred_rgb])
+    cv2.imwrite(path + "_gt.jpg", gt_bgr_o)
+    cv2.imwrite(path + "_pred.jpg", pred_bgr)
+
+
+def save_seg_results(
+    gt_seg: torch.Tensor, pred_seg: torch.Tensor, path: str
+) -> None:
+    r"""For saving predicted and ground truth seg maps during
+    EQA-CNN-Pretrain eval.
+
+    Args:
+        gt_seg: ground truth segmentation tensor
+        pred_seg: output segmentation tensor
+        path: to save images
+    """
+
+    path = path.format(split="val", type="seg")
+
+    gt_seg = gt_seg.cpu().numpy() % 40
+    pred_seg = torch.argmax(pred_seg, 0).cpu().numpy() % 40
+
+    gt_seg_colored = d3_40_colors_rgb[gt_seg]
+    pred_seg_colored = d3_40_colors_rgb[pred_seg]
+
+    cv2.imwrite(path + "_gt.jpg", gt_seg_colored)
+    cv2.imwrite(path + "_pred.jpg", pred_seg_colored)
+
+
+def save_depth_results(
+    gt_depth: torch.Tensor, pred_depth: torch.Tensor, path: str
+) -> None:
+    r"""For saving predicted and ground truth depth maps during
+    EQA-CNN-Pretrain eval.
+
+    Args:
+        gt_depth: ground truth depth tensor
+        pred_depth: output depth tensor
+        path: to save images
+    """
+    path = path.format(split="val", type="depth")
+
+    gt_depth, pred_depth = tensor_to_depth_images([gt_depth, pred_depth])
+
+    cv2.imwrite(path + "_gt.jpg", gt_depth)
+    cv2.imwrite(path + "_pred.jpg", pred_depth)
+
+
+def put_vqa_text_on_image(
+    image: np.ndarray,
+    question: str,
+    prediction: str,
+    ground_truth: str,
+) -> np.ndarray:
+    r"""For writing the VQA question, prediction and ground truth answer
+    on an image.
+    Args:
+        image: image on which text has to be written
+        question: input question to model
+        prediction: model's answer prediction
+        ground_truth: ground truth answer
+    Returns:
+        image with text
+    """
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    color = (0, 0, 0)
+    scale = 0.4
+    thickness = 1
+
+    cv2.putText(
+        image,
+        "Question: " + question,
+        (10, 15),
+        font,
+        scale,
+        color,
+        thickness,
+    )
+    cv2.putText(
+        image,
+        "Prediction: " + prediction,
+        (10, 30),
+        font,
+        scale,
+        color,
+        thickness,
+    )
+    cv2.putText(
+        image,
+        "Ground truth: " + ground_truth,
+        (10, 45),
+        font,
+        scale,
+        color,
+        thickness,
+    )
+
+    return image
+
+
+def save_vqa_image_results(
+    images_tensor: torch.Tensor,
+    question: str,
+    prediction: str,
+    ground_truth: str,
+    path: str,
+) -> None:
+    r"""For saving VQA input images with the input question and predicted answer.
+    Used to save model predictions during eval.
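A hypothetical call sketch for put_vqa_text_on_image above; it needs OpenCV at runtime, and the question/answer strings are invented:

```python
import numpy as np

frame = np.full((128, 256, 3), 255, dtype=np.uint8)  # blank white frame
annotated = put_vqa_text_on_image(
    frame,
    question="what color is the sofa?",
    prediction="brown",
    ground_truth="brown",
)
assert annotated.shape == frame.shape  # cv2.putText draws in place
```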
+ Args: + images_tensor: images' tensor containing input frames + question: input question to model + prediction: model's answer prediction + ground_truth: ground truth answer + path: to save images + Returns: + None + """ + + images = tensor_to_bgr_images(images_tensor) + + collage_image = cv2.hconcat(images) + collage_image = cv2.copyMakeBorder( + collage_image, + 55, + 0, + 0, + 0, + cv2.BORDER_CONSTANT, + value=(255, 255, 255), + ) + + image = put_vqa_text_on_image( + collage_image, question, prediction, ground_truth + ) + + cv2.imwrite(path, image) diff --git a/habitat-lab-dialog/mypy.ini b/habitat-lab-dialog/mypy.ini new file mode 100644 index 0000000..3bad1d0 --- /dev/null +++ b/habitat-lab-dialog/mypy.ini @@ -0,0 +1,39 @@ +[mypy] +disable_error_code=override + +# do not follow imports (except for ones found in typeshed) +ignore_missing_imports = True +#Ignore errors for third parties +ignore_errors = True +follow_imports = silent + +# treat Optional per PEP 484 +strict_optional = False + +warn_unused_configs = True +warn_redundant_casts = True +# ensure all execution paths are returning +warn_no_return= True +warn_unreachable = True +allow_redefinition = True + +show_error_codes = True +check_untyped_defs = True + + +files= + habitat, + habitat_baselines, + test +python_version = 3.6 + +# Third Party Dependencies + +[mypy-habitat.*] +ignore_errors = False + +[mypy-habitat_sim.*] +ignore_errors = False + +[mypy-habitat_baselines.*] +ignore_errors = False diff --git a/habitat-lab-dialog/pyproject.toml b/habitat-lab-dialog/pyproject.toml new file mode 100644 index 0000000..deea632 --- /dev/null +++ b/habitat-lab-dialog/pyproject.toml @@ -0,0 +1,22 @@ +[tool.black] +line_length = 79 +exclude = ''' +( + /( + \.eggs # exclude a few common directories in the + | \.git # root of the project + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | ^examples/tutorials/colabs + | ^examples/tutorials/nb_python + | build + | dist + | obselete + | deps + )/ +) +''' diff --git a/habitat-lab-dialog/requirements.txt b/habitat-lab-dialog/requirements.txt new file mode 100644 index 0000000..ab88813 --- /dev/null +++ b/habitat-lab-dialog/requirements.txt @@ -0,0 +1,13 @@ +gym>=0.17.3 +numpy>=1.16.1 +yacs>=0.1.5 +numpy-quaternion>=2019.3.18.14.33.20 +attrs>=19.1.0 +opencv-python>=3.3.0 +pickle5; python_version < '3.8' +# visualization optional dependencies +imageio>=2.2.0 +imageio-ffmpeg>=0.2.0 +scipy>=1.0.0 +tqdm>=4.0.0 +numba>=0.44.0 diff --git a/habitat-lab-dialog/res/img/habitat_compressed.gif b/habitat-lab-dialog/res/img/habitat_compressed.gif new file mode 100644 index 0000000..bca90e0 Binary files /dev/null and b/habitat-lab-dialog/res/img/habitat_compressed.gif differ diff --git a/habitat-lab-dialog/res/img/habitat_lab_structure.png b/habitat-lab-dialog/res/img/habitat_lab_structure.png new file mode 100644 index 0000000..5598121 Binary files /dev/null and b/habitat-lab-dialog/res/img/habitat_lab_structure.png differ diff --git a/habitat-lab-dialog/res/img/habitat_logo_with_text_horizontal_blue.png b/habitat-lab-dialog/res/img/habitat_logo_with_text_horizontal_blue.png new file mode 100644 index 0000000..39a234f Binary files /dev/null and b/habitat-lab-dialog/res/img/habitat_logo_with_text_horizontal_blue.png differ diff --git a/habitat-lab-dialog/res/img/tensorboard_video_demo.gif b/habitat-lab-dialog/res/img/tensorboard_video_demo.gif new file mode 100644 index 0000000..e7ed86f Binary files /dev/null and b/habitat-lab-dialog/res/img/tensorboard_video_demo.gif differ diff 
--git a/habitat-lab-dialog/scripts/generate_profile_shell_scripts.py b/habitat-lab-dialog/scripts/generate_profile_shell_scripts.py new file mode 100644 index 0000000..345f328 --- /dev/null +++ b/habitat-lab-dialog/scripts/generate_profile_shell_scripts.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +r"""Reference python script for profiling DDPPO PointNav on the FAIR internal +cluster used by the Habitat team. This script can be modified locally to suit +your needs, for example, to profile a different program or to profile with +different settings. + +For an overview of profiling and optimization in Habitat, see: +https://colab.research.google.com/gist/eundersander/b62bb497519b44cf4ceb10e2079525dc/faster-rl-training-profiling-and-optimization.ipynb + +This script's intended usage is: +1. Review and locally edit the documented options at the top of this file. +2. Run this python script to generate profiling shell script(s). +3. See the printed instructions and run the shell script. +""" + +import os + +if __name__ == "__main__": + + # The Habitat-lab program to be profiled (the command you usually use to + # invoke it). + program_str = "python -u -m habitat_baselines.run --exp-config habitat_baselines/config/pointnav/ddppo_pointnav.yaml --run-type train" + + # Path to Nsight Systems nsys command-line tool. This hard-coded path is + # for the FAIR cluster. + nsys_path = "/private/home/eundersander/nsight-systems-2020.3.1/bin/nsys" + + # You can either capture a step range or a time range. Capturing a step + # range is generally a better workflow, but it requires integrating + # profiling_utils.configure into your train program (beware, + # profiling_utils.configure is not yet merged into Habitat-sim). + do_capture_step_range = True + + if do_capture_step_range: + # "Step" here refers to however you defined a train step in your train + # program. See habitat-sim profiling_utils.configure. Prefer capturing a + # range of steps that are representative of your entire train job, in + # terms of the time spent in various parts of your program. Early train + # steps may suffer from poor agent behavior, too-short episodes, etc. If + # necessary, capture and inspect a very long-duration profile to + # determine when your training FPS "settles". + # DDPPO PointNav empirical test from Aug 2020, 8 nodes: + # FPS settled at ~190 steps + # DDPPO PointNav empirical test from Oct 2020, 2 nodes: + # FPS settled at ~1200 steps + capture_start_step = 1200 + + # If you're focusing on optimizing the train loop body (work that + # happens consistently every update), you don't need a large number + # here. However, beware overlooking infrequent events like env resets, + # scene loads, checkpointing, and eval. Beware profile storage + # requirement. DDPPO PointNav empirical test from Aug 2020: + # qdrep: 3.3 MB per 100 steps + # sqlite: 12 MB per 100 steps + # These figures are for a single task (see capture_all_tasks below). + num_steps_to_capture = 100 + else: + nsys_capture_delay_seconds = 120 + nsys_capture_duration_seconds = 120 + + # Launch the program distributed, using slurm. See also slurm_submit_str + # below for more slurm parameters like ntasks-per-node. 
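A quick storage estimate from the per-100-step figures quoted in the comments above; the task count here is a made-up example matching the 2-node, 8-tasks-per-node sbatch script further below.

```python
num_steps_to_capture = 100
num_profiled_tasks = 16  # hypothetical: capture_all_tasks on 2 nodes x 8 tasks
qdrep_mb = 3.3 * num_steps_to_capture / 100 * num_profiled_tasks
sqlite_mb = 12 * num_steps_to_capture / 100 * num_profiled_tasks
print(f"expect ~{qdrep_mb:.0f} MB of qdrep and ~{sqlite_mb:.0f} MB of sqlite")
```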
+ do_slurm = True + + # Path can be absolute or relative to the working directory (where you + # run the profiling shell script, which is probably the habitat-lab + # root directory). + profile_output_folder = "profiles" + + if do_slurm: + # You must use ${SLURM_NODEID} and ${SLURM_LOCALID} if using + # capture_all_tasks so that each profile gets a unique name. Use of + # ${SLURM_JOB_ID} is optional. + profile_output_filename_base = "profile_job${SLURM_JOB_ID}_node${SLURM_NODEID}_local${SLURM_LOCALID}" + else: + profile_output_filename_base = "local_profile" + + if do_slurm: + # A job duration to provide to slurm. Provide a reasonable upper bound + # here. It's not important to provide a tight bound. A too-short + # duration will cause your slurm job to terminate before profiles are + # saved. A much-too-large duration may result in a longer wait time + # before slurm starts your job. + # DDPPO PointNav empirical test from Aug 2020, 8 nodes: + # startup time is 2 minutes and 100 steps takes 12 minutes + # DDPPO PointNav empirical test from Oct 2020, 2 nodes: + # startup time is 2 minutes and 100 steps takes 5.9 minutes + buffered_start_minutes = 10 + buffered_minutes_per_100_steps = 8 + if do_capture_step_range: + slurm_job_termination_minutes = buffered_start_minutes + int( + (capture_start_step + num_steps_to_capture) + * buffered_minutes_per_100_steps + / 100 + ) + else: + slurm_job_termination_minutes = ( + nsys_capture_delay_seconds + nsys_capture_duration_seconds + ) * 60 + 5 + + # If capture_all_tasks==True, we capture profiles for all tasks. Beware + # large profile storage requirement in this case. If False, only one + # task runs with profiling. The other tasks run without profiling. In + # theory, all tasks behave similarly and so a single task's profile is + # representative of all tasks. In my DDPPO PointNav empirical test from + # Aug 2020, this was true. + capture_all_tasks = False + + # Useful for understanding your program's CUDA usage on the GPU. Beware + # large profile storage requirement. + capture_cuda = False + + # Beware, support is poor on the FAIR cluster and Colab machines due to + # older Nvidia drivers. For best OpenGL profiling, profile your desktop + # linux machine using the Nsight Systems GUI, not the nsys command-line + # tool. + capture_opengl = False + + # nsys produces a .qdrep multithreaded trace file which can be viewed in the + # Nsight GUI. Optionally, it can also export a .sqlite database file for use + # with habitat-sim's compare_profiles.py helper script. + export_sqlite = True + + # This is the end of the user-facing options, except see slurm_submit_str + # below. 
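Plugging the defaults above into the step-range branch of the duration formula gives a concrete bound (pure arithmetic, no new assumptions):

```python
buffered_start_minutes = 10
buffered_minutes_per_100_steps = 8
capture_start_step = 1200
num_steps_to_capture = 100
slurm_job_termination_minutes = buffered_start_minutes + int(
    (capture_start_step + num_steps_to_capture)
    * buffered_minutes_per_100_steps
    / 100
)
assert slurm_job_termination_minutes == 114  # 10 + 1300 * 8 / 100
```

Incidentally, the time-range branch above multiplies the capture seconds by 60 where a division by 60 looks intended; as written it appears to yield a bound in the tens of thousands of minutes.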
+ # ========================================================================= + + if do_capture_step_range: + program_with_extra_args_str = ( + program_str + + " PROFILING.CAPTURE_START_STEP " + + str(capture_start_step) + + " PROFILING.NUM_STEPS_TO_CAPTURE " + + str(num_steps_to_capture) + ) + else: + program_with_extra_args_str = program_str + + if do_capture_step_range: + capture_range_args = '--capture-range=nvtx -p "habitat_capture_range" --stop-on-range-end=true' + else: + capture_range_args = ( + "--delay=" + + str(nsys_capture_delay_seconds) + + " --duration=" + + str(nsys_capture_duration_seconds) + ) + + task_capture_str = ( + """export HABITAT_PROFILING=1 +export NSYS_NVTX_PROFILER_REGISTER_ONLY=0 +""" + + nsys_path + + " profile --sample=none --trace-fork-before-exec=true --force-overwrite=true --trace=nvtx" + + (",cuda" if capture_cuda else "") + + (",opengl" if capture_opengl else "") + + " " + + capture_range_args + + ' --output="' + + profile_output_folder + + "/" + + profile_output_filename_base + + '" ' + + ("--export=sqlite" if export_sqlite else "") + + " " + + program_with_extra_args_str + ) + + if do_slurm: + if capture_all_tasks: + slurm_task_str = ( + """#!/bin/sh +""" + + task_capture_str + + """ +""" + ) + else: + slurm_task_str = ( + """#!/bin/sh +if [ "${SLURM_NODEID}" = "0" ] && [ "${SLURM_LOCALID}" = "0" ] +then +""" + + task_capture_str + + """ +else +""" + + program_str + + """ +fi +""" + ) + + slurm_submit_str = ( + """#!/bin/bash +#SBATCH --job-name=capture_profile +#SBATCH --output=/checkpoint/%u/jobs/job.%j.out +#SBATCH --error=/checkpoint/%u/jobs/job.%j.err +#SBATCH --gpus-per-task 1 +#SBATCH --nodes 2 +#SBATCH --cpus-per-task 10 +#SBATCH --ntasks-per-node 8 +#SBATCH --mem-per-cpu=5GB +#SBATCH --partition=dev +#SBATCH --time=""" + + str(slurm_job_termination_minutes) + + """:00 +#SBATCH --open-mode=append +export GLOG_minloglevel=2 +export MAGNUM_LOG=quiet +MASTER_ADDR=$(srun --ntasks=1 hostname 2>&1 | tail -n1) +export MASTER_ADDR +set -x +srun bash capture_profile_slurm_task.sh +""" + ) + + profile_output_filepath = ( + profile_output_folder + "/" + profile_output_filename_base + ".qdrep" + ) + if not do_slurm and os.path.exists(profile_output_filepath): + print( + "warning: {} already exists and will be overwritten.".format( + profile_output_filepath + ) + ) + + if not os.path.exists(profile_output_folder): + os.makedirs(profile_output_folder) + print("created directory: " + profile_output_folder) + + if do_slurm: + with open("capture_profile_slurm_task.sh", "w") as f: + f.write(slurm_task_str) + print("wrote capture_profile_slurm_task.sh") + + with open("capture_profile_slurm.sh", "w") as f: + f.write(slurm_submit_str) + print("wrote capture_profile_slurm.sh") + + print( + "\nTo start capture, do:\nchmod +x capture_profile_slurm_task.sh\nchmod +x capture_profile_slurm.sh\nsbatch capture_profile_slurm.sh" + ) + + else: + with open("capture_profile.sh", "w") as f: + f.write(task_capture_str) + print("wrote capture_profile.sh") + + print( + "\nTo start capture, do:\nchmod +x capture_profile.sh\n./capture_profile.sh" + ) diff --git a/habitat-lab-dialog/setup.cfg b/habitat-lab-dialog/setup.cfg new file mode 100644 index 0000000..7a6c7a0 --- /dev/null +++ b/habitat-lab-dialog/setup.cfg @@ -0,0 +1,35 @@ +[aliases] +test=pytest + +[flake8] +select = A,B,C,F,R,W,SIM +exclude = + .git, + __pycache__, + build, + data, + dist, + docs, + src/deps +max-line-length = 88 +# A003 prevents class attrs from having builtin name properties +# C401, and C402 are ignored 
to make scanning between dict and set easy +# C408 ignored because we like the dict keyword argument syntax +# R504 has some false positives since it doesn't care about side effects +# W503 is incompatible with Black +# SIM105 is a nice suggestion but except: ImportError is also really readable +# SIM106 has too many false positives +# SIM113 has too many false positives +ignore = + A003, + C401,C402,C408, + SIM105,SIM106,SIM113, + R504, + W503, +per-file-ignores = + */__init__.py:F401 + examples/tutorials/nb_python/*.py:B008,F841 + +[tool:pytest] +addopts = --verbose -rsxX -q +testpaths = test diff --git a/habitat-lab-dialog/setup.py b/habitat-lab-dialog/setup.py new file mode 100644 index 0000000..32b4570 --- /dev/null +++ b/habitat-lab-dialog/setup.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import glob +import os.path +import sys + +import setuptools +from setuptools.command.develop import develop as DefaultDevelopCommand +from setuptools.command.install import install as DefaultInstallCommand + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "habitat")) +from version import VERSION # isort:skip noqa + + +with open("README.md", encoding="utf8") as f: + readme = f.read() + +with open("LICENSE") as f: + license_text = f.read() + +with open("requirements.txt") as f: + reqs = f.read() + +DISTNAME = "habitat" +DESCRIPTION = "habitat: a suite for embodied agent tasks and benchmarks" +LONG_DESCRIPTION = readme +AUTHOR = "Facebook AI Research" +LICENSE = license_text +REQUIREMENTS = reqs.strip().split("\n") +BASELINE_PATH = ["habitat_baselines", "habitat_baselines.*"] +DEFAULT_EXCLUSION = ["test", "examples"] +FULL_REQUIREMENTS = set() +# collect requirements.txt file in all subdirectories +for file_name in ["requirements.txt"] + glob.glob( + "habitat_baselines/**/requirements.txt", recursive=True +): + with open(file_name) as f: + reqs = f.read() + FULL_REQUIREMENTS.update(reqs.strip().split("\n")) + + +class OptionedCommand: + r"""Generic Command class that takes extra user options and modifies + arguments in setuptools.setup() accordingly. + Though OptionedCommand inherits directly from object, it assumes + inheritance from DefaultDevelopCommand or DefaultInstallCommand, as it + overrides methods from those two classes. 
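The FULL_REQUIREMENTS collection above is worth seeing in isolation: a runnable sketch of the same union-of-files pattern, with paths as in setup.py and a guard so it degrades gracefully outside the repo root.

```python
import glob

full_requirements = set()
for file_name in ["requirements.txt"] + glob.glob(
    "habitat_baselines/**/requirements.txt", recursive=True
):
    try:
        with open(file_name) as f:
            full_requirements.update(f.read().strip().split("\n"))
    except FileNotFoundError:
        pass  # not running from the repo root
print(sorted(full_requirements))
```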
+ """ + + user_options = [("all", None, "include habitat_baselines in installation")] + + def initialize_options(self): + super().initialize_options() + self.all = None + + def run(self): + if not self.all: # install core only + DEFAULT_EXCLUSION.extend(BASELINE_PATH) + self.distribution.packages = setuptools.find_packages( + exclude=DEFAULT_EXCLUSION + ) + # self.distribution accesses arguments of setup() in main() + else: # install all except test and examples + self.distribution.install_requires = FULL_REQUIREMENTS + super().run() + + +class InstallCommand(OptionedCommand, DefaultInstallCommand): + user_options = ( + getattr(DefaultInstallCommand, "user_options", []) + + OptionedCommand.user_options + ) + + +class DevelopCommand(OptionedCommand, DefaultDevelopCommand): + user_options = ( + getattr(DefaultDevelopCommand, "user_options", []) + + OptionedCommand.user_options + ) + + +if __name__ == "__main__": + setuptools.setup( + name=DISTNAME, + install_requires=REQUIREMENTS, + packages=setuptools.find_packages(exclude=DEFAULT_EXCLUSION), + version=VERSION, + description=DESCRIPTION, + long_description=LONG_DESCRIPTION, + author=AUTHOR, + license=LICENSE, + setup_requires=["pytest-runner"], + tests_require=["pytest-cov", "pytest-mock", "pytest"], + include_package_data=True, + cmdclass={"install": InstallCommand, "develop": DevelopCommand}, + ) diff --git a/habitat-lab-dialog/test/test_baseline_agents.py b/habitat-lab-dialog/test/test_baseline_agents.py new file mode 100644 index 0000000..e0072cc --- /dev/null +++ b/habitat-lab-dialog/test/test_baseline_agents.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import itertools +import os + +import pytest + +import habitat + +try: + from habitat_baselines.agents import ppo_agents, simple_agents + + baseline_installed = True +except ImportError: + baseline_installed = False + +CFG_TEST = "configs/test/habitat_all_sensors_test.yaml" + + +@pytest.mark.skipif( + not baseline_installed, reason="baseline sub-module not installed" +) +@pytest.mark.parametrize( + "input_type,resolution", + [ + (i_type, resolution) + for i_type, resolution in itertools.product( + ["blind", "rgb", "depth", "rgbd"], [256, 384] + ) + ], +) +def test_ppo_agents(input_type, resolution): + + agent_config = ppo_agents.get_default_config() + agent_config.MODEL_PATH = "" + agent_config.defrost() + config_env = habitat.get_config(config_paths=CFG_TEST) + if not os.path.exists(config_env.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + + benchmark = habitat.Benchmark(config_paths=CFG_TEST) + + config_env.defrost() + config_env.SIMULATOR.AGENT_0.SENSORS = [] + if input_type in ["rgb", "rgbd"]: + config_env.SIMULATOR.AGENT_0.SENSORS += ["RGB_SENSOR"] + agent_config.RESOLUTION = resolution + config_env.SIMULATOR.RGB_SENSOR.WIDTH = resolution + config_env.SIMULATOR.RGB_SENSOR.HEIGHT = resolution + if input_type in ["depth", "rgbd"]: + config_env.SIMULATOR.AGENT_0.SENSORS += ["DEPTH_SENSOR"] + agent_config.RESOLUTION = resolution + config_env.SIMULATOR.DEPTH_SENSOR.WIDTH = resolution + config_env.SIMULATOR.DEPTH_SENSOR.HEIGHT = resolution + + config_env.freeze() + + del benchmark._env + benchmark._env = habitat.Env(config=config_env) + agent_config.INPUT_TYPE = input_type + + agent = ppo_agents.PPOAgent(agent_config) + habitat.logger.info(benchmark.evaluate(agent, num_episodes=10)) + + +@pytest.mark.skipif( + not baseline_installed, reason="baseline sub-module not installed" +) +def test_simple_agents(): + config_env = habitat.get_config(config_paths=CFG_TEST) + + if not os.path.exists(config_env.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + + benchmark = habitat.Benchmark(config_paths=CFG_TEST) + + for agent_class in [ + simple_agents.ForwardOnlyAgent, + simple_agents.GoalFollower, + simple_agents.RandomAgent, + simple_agents.RandomForwardAgent, + ]: + agent = agent_class( + config_env.TASK.SUCCESS_DISTANCE, config_env.TASK.GOAL_SENSOR_UUID + ) + habitat.logger.info(agent_class.__name__) + habitat.logger.info(benchmark.evaluate(agent, num_episodes=100)) + + benchmark._env.close() diff --git a/habitat-lab-dialog/test/test_baseline_trainers.py b/habitat-lab-dialog/test/test_baseline_trainers.py new file mode 100644 index 0000000..837c92f --- /dev/null +++ b/habitat-lab-dialog/test/test_baseline_trainers.py @@ -0,0 +1,330 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
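The parametrization in test_ppo_agents above expands to a predictable grid of cases; a small check of the cross product (pure itertools, no new assumptions):

```python
import itertools

cases = list(
    itertools.product(["blind", "rgb", "depth", "rgbd"], [256, 384])
)
assert len(cases) == 8  # 4 input types x 2 resolutions
assert ("rgbd", 384) in cases
```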
+ +import gc +import itertools +import math +import os +import random +from copy import deepcopy +from glob import glob + +import pytest + +from habitat.core.vector_env import VectorEnv + +try: + import torch + import torch.distributed + + from habitat_baselines.common.base_trainer import BaseRLTrainer + from habitat_baselines.common.baseline_registry import baseline_registry + from habitat_baselines.config.default import get_config + from habitat_baselines.run import execute_exp, run_exp + from habitat_baselines.utils.common import batch_obs + + baseline_installed = True +except ImportError: + baseline_installed = False + + +def _powerset(s): + return [ + combo + for r in range(len(s) + 1) + for combo in itertools.combinations(s, r) + ] + + +@pytest.mark.skipif( + not baseline_installed, reason="baseline sub-module not installed" +) +@pytest.mark.parametrize( + "test_cfg_path,mode,gpu2gpu,observation_transforms", + list( + itertools.product( + glob("habitat_baselines/config/test/*"), + ["train", "eval"], + [False], + [ + [], + [ + "CenterCropper", + "ResizeShortestEdge", + ], + ], + ) + ) + + list( + itertools.product( + ["habitat_baselines/config/test/ppo_pointnav_test.yaml"], + ["train", "eval"], + [True], + [ + [], + [ + "CenterCropper", + "ResizeShortestEdge", + ], + ], + ) + ), +) +def test_trainers(test_cfg_path, mode, gpu2gpu, observation_transforms): + # For testing with world_size=1, -1 works as port in PyTorch + os.environ["MASTER_PORT"] = str(-1) + + if gpu2gpu: + try: + import habitat_sim + except ImportError: + pytest.skip("GPU-GPU requires Habitat-Sim") + + if not habitat_sim.cuda_enabled: + pytest.skip("GPU-GPU requires CUDA") + + run_exp( + test_cfg_path, + mode, + [ + "TASK_CONFIG.SIMULATOR.HABITAT_SIM_V0.GPU_GPU", + str(gpu2gpu), + "RL.POLICY.OBS_TRANSFORMS.ENABLED_TRANSFORMS", + str(tuple(observation_transforms)), + ], + ) + + # Needed to destroy the trainer + gc.collect() + + # Deinit processes group + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + +@pytest.mark.skipif( + not baseline_installed, reason="baseline sub-module not installed" +) +@pytest.mark.parametrize( + "test_cfg_path,mode", + [ + [ + "habitat_baselines/config/test/ppo_pointnav_test.yaml", + "train", + ], + ], +) +@pytest.mark.parametrize("camera", ["equirect", "fisheye", "cubemap"]) +@pytest.mark.parametrize("sensor_type", ["RGB", "DEPTH"]) +def test_cubemap_stiching( + test_cfg_path: str, mode: str, camera: str, sensor_type: str +): + meta_config = get_config(config_paths=test_cfg_path) + meta_config.defrost() + config = meta_config.TASK_CONFIG + CAMERA_NUM = 6 + orient = [ + [0, math.pi, 0], # Back + [-math.pi / 2, 0, 0], # Down + [0, 0, 0], # Front + [0, math.pi / 2, 0], # Right + [0, 3 / 2 * math.pi, 0], # Left + [math.pi / 2, 0, 0], # Up + ] + sensor_uuids = [] + + if f"{sensor_type}_SENSOR" not in config.SIMULATOR.AGENT_0.SENSORS: + config.SIMULATOR.AGENT_0.SENSORS.append(f"{sensor_type}_SENSOR") + sensor = getattr(config.SIMULATOR, f"{sensor_type}_SENSOR") + for camera_id in range(CAMERA_NUM): + camera_template = f"{sensor_type}_{camera_id}" + camera_config = deepcopy(sensor) + camera_config.ORIENTATION = orient[camera_id] + camera_config.UUID = camera_template.lower() + sensor_uuids.append(camera_config.UUID) + setattr(config.SIMULATOR, camera_template, camera_config) + config.SIMULATOR.AGENT_0.SENSORS.append(camera_template) + + meta_config.TASK_CONFIG = config + meta_config.SENSORS = config.SIMULATOR.AGENT_0.SENSORS + if camera == "equirect": + 
meta_config.RL.POLICY.OBS_TRANSFORMS.CUBE2EQ.SENSOR_UUIDS = tuple( + sensor_uuids + ) + elif camera == "fisheye": + meta_config.RL.POLICY.OBS_TRANSFORMS.CUBE2FISH.SENSOR_UUIDS = tuple( + sensor_uuids + ) + meta_config.freeze() + if camera in ["equirect", "fisheye"]: + execute_exp(meta_config, mode) + # Deinit processes group + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + elif camera == "cubemap": + # 1) Generate an equirect image from cubemap images. + # 2) Generate cubemap images from the equirect image. + # 3) Compare the input and output cubemap + env_fn_args = [] + for split in ["train", "val"]: + tmp_config = config.clone() + tmp_config.defrost() + tmp_config.DATASET["SPLIT"] = split + tmp_config.freeze() + env_fn_args.append((tmp_config, None)) + + with VectorEnv(env_fn_args=env_fn_args) as envs: + observations = envs.reset() + batch = batch_obs(observations) + orig_batch = deepcopy(batch) + + # ProjectionTransformer + obs_trans_to_eq = baseline_registry.get_obs_transformer( + "CubeMap2Equirect" + ) + cube2equirect = obs_trans_to_eq(sensor_uuids, (256, 512)) + obs_trans_to_cube = baseline_registry.get_obs_transformer( + "Equirect2CubeMap" + ) + equirect2cube = obs_trans_to_cube( + cube2equirect.target_uuids, (256, 256) + ) + + # Cubemap to Equirect to Cubemap + batch_eq = cube2equirect(batch) + batch_cube = equirect2cube(batch_eq) + + # Extract input and output cubemap + output_cube = batch_cube[cube2equirect.target_uuids[0]] + input_cube = [orig_batch[key] for key in sensor_uuids] + input_cube = torch.stack(input_cube, axis=1) + input_cube = torch.flatten(input_cube, end_dim=1) + + # Apply blur to absorb difference (blur, etc.) caused by conversion + if sensor_type == "RGB": + output_cube = output_cube.float() / 255 + input_cube = input_cube.float() / 255 + output_cube = output_cube.permute((0, 3, 1, 2)) # NHWC => NCHW + input_cube = input_cube.permute((0, 3, 1, 2)) # NHWC => NCHW + apply_blur = torch.nn.AvgPool2d(5, 3, 2) + output_cube = apply_blur(output_cube) + input_cube = apply_blur(input_cube) + + # Calculate the difference + diff = torch.abs(output_cube - input_cube) + assert diff.mean().item() < 0.01 + else: + raise ValueError(f"Unknown camera name: {camera}") + + +@pytest.mark.skipif( + not baseline_installed, reason="baseline sub-module not installed" +) +def test_eval_config(): + ckpt_opts = ["VIDEO_OPTION", "[]"] + eval_opts = ["VIDEO_OPTION", "['disk']"] + + ckpt_cfg = get_config(None, ckpt_opts) + assert ckpt_cfg.VIDEO_OPTION == [] + assert ckpt_cfg.CMD_TRAILING_OPTS == ["VIDEO_OPTION", "[]"] + + eval_cfg = get_config(None, eval_opts) + assert eval_cfg.VIDEO_OPTION == ["disk"] + assert eval_cfg.CMD_TRAILING_OPTS == ["VIDEO_OPTION", "['disk']"] + + trainer = BaseRLTrainer(get_config()) + assert trainer.config.VIDEO_OPTION == ["disk", "tensorboard"] + returned_config = trainer._setup_eval_config(checkpoint_config=ckpt_cfg) + assert returned_config.VIDEO_OPTION == [] + + trainer = BaseRLTrainer(eval_cfg) + returned_config = trainer._setup_eval_config(ckpt_cfg) + assert returned_config.VIDEO_OPTION == ["disk"] + + +def __do_pause_test(num_envs, envs_to_pause): + class PausableShim(VectorEnv): + def __init__(self, num_envs): + self._running = list(range(num_envs)) + + @property + def num_envs(self): + return len(self._running) + + def pause_at(self, idx): + self._running.pop(idx) + + envs = PausableShim(num_envs) + test_recurrent_hidden_states = ( + torch.arange(num_envs).view(num_envs, 1, 1).expand(num_envs, 4, 512) + ) + 
not_done_masks = torch.arange(num_envs).view(num_envs, 1) + current_episode_reward = torch.arange(num_envs).view(num_envs, 1) + prev_actions = torch.arange(num_envs).view(num_envs, 1) + batch = { + k: torch.arange(num_envs) + .view(num_envs, 1, 1, 1) + .expand(num_envs, 3, 256, 256) + for k in ["a", "b"] + } + rgb_frames = [[idx] for idx in range(num_envs)] + + ( + envs, + test_recurrent_hidden_states, + not_done_masks, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + ) = BaseRLTrainer._pause_envs( + envs_to_pause, + envs, + test_recurrent_hidden_states, + not_done_masks, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + ) + + expected = sorted(set(range(num_envs)) - set(envs_to_pause)) + + assert envs._running == expected + + assert list(test_recurrent_hidden_states.size()) == [len(expected), 4, 512] + assert test_recurrent_hidden_states[:, 0, 0].numpy().tolist() == expected + + assert not_done_masks[:, 0].numpy().tolist() == expected + assert current_episode_reward[:, 0].numpy().tolist() == expected + assert prev_actions[:, 0].numpy().tolist() == expected + assert [v[0] for v in rgb_frames] == expected + + for _, v in batch.items(): + assert list(v.size()) == [len(expected), 3, 256, 256] + assert v[:, 0, 0, 0].numpy().tolist() == expected + + +@pytest.mark.skipif( + not baseline_installed, reason="baseline sub-module not installed" +) +def test_pausing(): + random.seed(0) + for _ in range(100): + num_envs = random.randint(1, 13) + envs_to_pause = list(range(num_envs)) + + random.shuffle(envs_to_pause) + envs_to_pause = envs_to_pause[: random.randint(0, num_envs)] + # envs_to_pause is assumed to be sorted in the function + envs_to_pause = sorted(envs_to_pause) + + __do_pause_test(num_envs, envs_to_pause) + + num_envs = 8 + __do_pause_test(num_envs, []) + __do_pause_test(num_envs, list(range(num_envs))) diff --git a/habitat-lab-dialog/test/test_config.py b/habitat-lab-dialog/test/test_config.py new file mode 100644 index 0000000..d9d5b8a --- /dev/null +++ b/habitat-lab-dialog/test/test_config.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
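Note on the pause test above: it hinges on BaseRLTrainer._pause_envs index-selecting every per-env tensor along dim 0, so that only the non-paused rows survive, in their original order. A standalone torch illustration of that invariant (not habitat code):

import torch

num_envs = 4
envs_to_pause = [1, 3]

state = torch.arange(num_envs).view(num_envs, 1)  # one row per env
keep = [i for i in range(num_envs) if i not in envs_to_pause]

state = state[keep]  # rows 0 and 2 survive, order preserved
print(state[:, 0].tolist())  # -> [0, 2]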
+ +from habitat.config.default import get_config + +CFG_TEST = "configs/test/habitat_all_sensors_test.yaml" +CFG_EQA = "configs/test/habitat_mp3d_eqa_test.yaml" +CFG_NEW_KEYS = "configs/test/new_keys_test.yaml" +MAX_TEST_STEPS_LIMIT = 3 + + +def test_merged_configs(): + test_config = get_config(CFG_TEST) + eqa_config = get_config(CFG_EQA) + merged_config = get_config("{},{}".format(CFG_TEST, CFG_EQA)) + assert merged_config.TASK.TYPE == eqa_config.TASK.TYPE + assert ( + merged_config.ENVIRONMENT.MAX_EPISODE_STEPS + == test_config.ENVIRONMENT.MAX_EPISODE_STEPS + ) + + +def test_new_keys_merged_configs(): + test_config = get_config(CFG_TEST) + new_keys_config = get_config(CFG_NEW_KEYS) + merged_config = get_config("{},{}".format(CFG_TEST, CFG_NEW_KEYS)) + assert ( + merged_config.TASK.MY_NEW_TASK_PARAM + == new_keys_config.TASK.MY_NEW_TASK_PARAM + ) + assert ( + merged_config.ENVIRONMENT.MAX_EPISODE_STEPS + == test_config.ENVIRONMENT.MAX_EPISODE_STEPS + ) + + +def test_overwrite_options(): + for steps_limit in range(MAX_TEST_STEPS_LIMIT): + config = get_config( + config_paths=CFG_TEST, + opts=["ENVIRONMENT.MAX_EPISODE_STEPS", steps_limit], + ) + assert ( + config.ENVIRONMENT.MAX_EPISODE_STEPS == steps_limit + ), "Overwriting of config options failed." diff --git a/habitat-lab-dialog/test/test_dataset.py b/habitat-lab-dialog/test/test_dataset.py new file mode 100644 index 0000000..fc02442 --- /dev/null +++ b/habitat-lab-dialog/test/test_dataset.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +from itertools import groupby, islice + +import pytest + +from habitat.core.dataset import Dataset, Episode + + +def _construct_dataset(num_episodes, num_groups=10): + episodes = [] + for i in range(num_episodes): + episode = Episode( + episode_id=str(i), + scene_id="scene_id_" + str(i % num_groups), + start_position=[0, 0, 0], + start_rotation=[0, 0, 0, 1], + ) + episodes.append(episode) + dataset = Dataset() + dataset.episodes = episodes + return dataset + + +def test_scene_ids(): + dataset = _construct_dataset(100) + assert dataset.scene_ids == ["scene_id_" + str(ii) for ii in range(10)] + + +def test_get_scene_episodes(): + dataset = _construct_dataset(100) + scene = "scene_id_0" + scene_episodes = dataset.get_scene_episodes(scene) + assert len(scene_episodes) == 10 + for ep in scene_episodes: + assert ep.scene_id == scene + + +def test_filter_episodes(): + dataset = _construct_dataset(100) + + def filter_fn(episode: Episode) -> bool: + return int(episode.episode_id) % 2 == 0 + + filtered_dataset = dataset.filter_episodes(filter_fn) + assert len(filtered_dataset.episodes) == 50 + for ep in filtered_dataset.episodes: + assert filter_fn(ep) + + +def test_get_splits_even_split_possible(): + dataset = _construct_dataset(100) + splits = dataset.get_splits(10) + assert len(splits) == 10 + for split in splits: + assert len(split.episodes) == 10 + + +def test_get_splits_with_remainder(): + dataset = _construct_dataset(100) + splits = dataset.get_splits(11) + assert len(splits) == 11 + for split in splits: + assert len(split.episodes) == 9 + + +def test_get_splits_num_episodes_specified(): + dataset = _construct_dataset(100) + splits = dataset.get_splits(10, 3, False) + assert len(splits) == 10 + for split in splits: + assert len(split.episodes) == 3 + assert len(dataset.episodes) == 100 + + dataset = _construct_dataset(100) + splits 
= dataset.get_splits(10, 10) + assert len(splits) == 10 + for split in splits: + assert len(split.episodes) == 10 + assert len(dataset.episodes) == 100 + + dataset = _construct_dataset(100) + splits = dataset.get_splits(10, 3, True) + assert len(splits) == 10 + for split in splits: + assert len(split.episodes) == 3 + assert len(dataset.episodes) == 30 + + dataset = _construct_dataset(100) + with pytest.raises(ValueError): + splits = dataset.get_splits(10, 20) + + +def test_get_splits_collate_scenes(): + dataset = _construct_dataset(10000) + splits = dataset.get_splits(10, 23, collate_scene_ids=True) + assert len(splits) == 10 + for split in splits: + assert len(split.episodes) == 23 + prev_ids = set() + for ii, ep in enumerate(split.episodes): + if ep.scene_id not in prev_ids: + prev_ids.add(ep.scene_id) + else: + assert split.episodes[ii - 1].scene_id == ep.scene_id + + dataset = _construct_dataset(10000) + splits = dataset.get_splits(10, 200, collate_scene_ids=False) + assert len(splits) == 10 + for split in splits: + prev_ids = set() + found_not_collated = False + for ii, ep in enumerate(split.episodes): + if ep.scene_id not in prev_ids: + prev_ids.add(ep.scene_id) + else: + if split.episodes[ii - 1].scene_id != ep.scene_id: + found_not_collated = True + break + assert found_not_collated + + dataset = _construct_dataset(10000) + splits = dataset.get_splits(10, collate_scene_ids=True) + assert len(splits) == 10 + for split in splits: + assert len(split.episodes) == 1000 + prev_ids = set() + for ii, ep in enumerate(split.episodes): + if ep.scene_id not in prev_ids: + prev_ids.add(ep.scene_id) + else: + assert split.episodes[ii - 1].scene_id == ep.scene_id + + dataset = _construct_dataset(10000) + splits = dataset.get_splits(10, collate_scene_ids=False) + assert len(splits) == 10 + for split in splits: + prev_ids = set() + found_not_collated = False + for ii, ep in enumerate(split.episodes): + if ep.scene_id not in prev_ids: + prev_ids.add(ep.scene_id) + else: + if split.episodes[ii - 1].scene_id != ep.scene_id: + found_not_collated = True + break + assert found_not_collated + + +def test_get_splits_sort_by_episode_id(): + dataset = _construct_dataset(10000) + splits = dataset.get_splits(10, 23, sort_by_episode_id=True) + assert len(splits) == 10 + for split in splits: + assert len(split.episodes) == 23 + for ii, ep in enumerate(split.episodes): + if ii > 0: + assert ep.episode_id >= split.episodes[ii - 1].episode_id + + +@pytest.mark.parametrize( + "num_episodes,num_splits", + [(994, 64), (1023, 64), (1024, 64), (1025, 64), (10000, 9), (10000, 10)], +) +def test_get_splits_func(num_episodes: int, num_splits: int): + dataset = _construct_dataset(num_episodes) + splits = dataset.get_splits(num_splits, allow_uneven_splits=True) + assert len(splits) == num_splits + assert sum(len(split.episodes) for split in splits) == num_episodes + splits = dataset.get_splits(num_splits, allow_uneven_splits=False) + assert len(splits) == num_splits + assert ( + sum(map(lambda s: s.num_episodes, splits)) + == (num_episodes // num_splits) * num_splits + ) + + +def test_sample_episodes(): + dataset = _construct_dataset(1000) + ep_iter = dataset.get_episode_iterator( + num_episode_sample=1000, cycle=False + ) + assert len(list(ep_iter)) == 1000 + + ep_iter = dataset.get_episode_iterator(num_episode_sample=0, cycle=False) + assert len(list(ep_iter)) == 0 + + with pytest.raises(ValueError): + dataset.get_episode_iterator(num_episode_sample=1001, cycle=False) + + ep_iter = 
dataset.get_episode_iterator(num_episode_sample=100, cycle=True) + ep_id_list = [e.episode_id for e in list(islice(ep_iter, 100))] + assert len(set(ep_id_list)) == 100 + next_episode = next(ep_iter) + assert next_episode.episode_id in ep_id_list + + ep_iter = dataset.get_episode_iterator(num_episode_sample=0, cycle=False) + with pytest.raises(StopIteration): + next(ep_iter) + + +def test_iterator_cycle(): + dataset = _construct_dataset(100) + ep_iter = dataset.get_episode_iterator( + cycle=True, shuffle=False, group_by_scene=False + ) + for i in range(200): + episode = next(ep_iter) + assert episode.episode_id == dataset.episodes[i % 100].episode_id + + ep_iter = dataset.get_episode_iterator(cycle=True, num_episode_sample=20) + episodes = list(islice(ep_iter, 20)) + for i in range(200): + episode = next(ep_iter) + assert episode.episode_id == episodes[i % 20].episode_id + + +def test_iterator_shuffle(): + dataset = _construct_dataset(100) + episode_iter = dataset.get_episode_iterator(shuffle=True) + first_round_episodes = list(islice(episode_iter, 100)) + second_round_episodes = list(islice(episode_iter, 100)) + + # both rounds should have same episodes but in different order + assert sorted(first_round_episodes) == sorted(second_round_episodes) + assert first_round_episodes != second_round_episodes + + # both rounds should be grouped by scenes + first_round_scene_groups = [ + k for k, g in groupby(first_round_episodes, key=lambda x: x.scene_id) + ] + second_round_scene_groups = [ + k for k, g in groupby(second_round_episodes, key=lambda x: x.scene_id) + ] + assert len(first_round_scene_groups) == len(second_round_scene_groups) + assert len(first_round_scene_groups) == len(set(first_round_scene_groups)) + + +def test_iterator_scene_switching_episodes(): + total_ep = 1000 + max_repeat = 25 + dataset = _construct_dataset(total_ep) + + episode_iter = dataset.get_episode_iterator( + max_scene_repeat_episodes=max_repeat, shuffle=False, cycle=True + ) + episodes = sorted(dataset.episodes, key=lambda x: x.scene_id) + + for _ in range(max_repeat): + episode = next(episode_iter) + assert ( + episode.episode_id == episodes.pop(0).episode_id + ), "episodes before max_repeat reached should be identical" + + episode = next(episode_iter) + assert ( + episode.scene_id != episodes.pop(0).scene_id + ), "After max_repeat episodes a scene switch doesn't happen." + + remaining_episodes = list(islice(episode_iter, total_ep - max_repeat - 1)) + assert len(remaining_episodes) == len( + episodes + ), "Remaining episodes should be identical." + + assert len({e.scene_id for e in remaining_episodes}) == len( + set(map(lambda ep: ep.scene_id, remaining_episodes)) + ), "Next episodes should still include all scenes." + + cycled_episodes = list(islice(episode_iter, 4 * total_ep)) + assert ( + len(set(map(lambda x: x.episode_id, cycled_episodes))) == total_ep + ), "Some episodes leaked after cycling." + + grouped_episodes = [ + list(g) for k, g in groupby(cycled_episodes, key=lambda x: x.scene_id) + ] + assert ( + len(sum(grouped_episodes, [])) == 4 * total_ep + ), "Cycled episode iterator returned unexpected number of episodes." + assert ( + len(grouped_episodes) == 4 * total_ep / max_repeat + ), "The number of scene switches is unexpected." + + assert all( + len(group) == max_repeat for group in grouped_episodes + ), "Not all scene switches are equal to required number." 
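Note: the iterator behaviours verified in the surrounding tests all come from Dataset.get_episode_iterator. A compact usage sketch, building episodes the same way _construct_dataset does; the expectation in the final comment mirrors the scene-grouping semantics these tests assert:

from itertools import islice

from habitat.core.dataset import Dataset, Episode

dataset = Dataset()
dataset.episodes = [
    Episode(
        episode_id=str(i),
        scene_id="scene_id_" + str(i % 3),
        start_position=[0, 0, 0],
        start_rotation=[0, 0, 0, 1],
    )
    for i in range(30)
]

# Cycle forever, no shuffling, force a scene switch every 5 episodes.
ep_iter = dataset.get_episode_iterator(
    cycle=True, shuffle=False, max_scene_repeat_episodes=5
)
print([ep.scene_id for ep in islice(ep_iter, 10)])
# expected: five episodes from one scene, then a scene switch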
+
+
+def test_iterator_scene_switching_episodes_without_shuffle_cycle():
+    total_ep = 1000
+    max_repeat = 25
+    dataset = _construct_dataset(total_ep)
+    episode_iter = dataset.get_episode_iterator(
+        max_scene_repeat_episodes=max_repeat, shuffle=False, cycle=False
+    )
+
+    grouped_episodes = [
+        list(g) for k, g in groupby(episode_iter, key=lambda x: x.scene_id)
+    ]
+    assert (
+        len(sum(grouped_episodes, [])) == total_ep
+    ), "The episode iterator returned an unexpected number of episodes."
+    assert (
+        len(grouped_episodes) == total_ep / max_repeat
+    ), "The number of scene switches is unexpected."
+
+    assert all(
+        len(group) == max_repeat for group in grouped_episodes
+    ), "Not all scene groups contain the required number of episodes."
+
+
+def test_iterator_scene_switching_steps():
+    total_ep = 1000
+    max_repeat_steps = 250
+    dataset = _construct_dataset(total_ep)
+
+    episode_iter = dataset.get_episode_iterator(
+        max_scene_repeat_steps=max_repeat_steps,
+        shuffle=False,
+        step_repetition_range=0.0,
+    )
+    episodes = sorted(dataset.episodes, key=lambda x: x.scene_id)
+
+    episode = next(episode_iter)
+    assert (
+        episode.episode_id == episodes.pop(0).episode_id
+    ), "First episode should be identical before any steps are taken."
+
+    # Report max_repeat_steps simulator steps to the iterator
+    for _ in range(max_repeat_steps):
+        episode_iter.step_taken()
+
+    episode = next(episode_iter)
+    assert (
+        episode.episode_id != episodes.pop(0).episode_id
+    ), "After max_repeat_steps steps a scene switch doesn't happen."
+
+    remaining_episodes = list(islice(episode_iter, total_ep - 2))
+    assert len(remaining_episodes) == len(
+        episodes
+    ), "Remaining episode counts aren't equal."
+
+    assert len({e.scene_id for e in remaining_episodes}) == len(
+        list(groupby(remaining_episodes, lambda ep: ep.scene_id))
+    ), (
+        "Next episodes should still be grouped by scene (before next "
+        "switching)."
+    )
+
+
+def test_preserve_order():
+    dataset = _construct_dataset(100)
+    episodes = sorted(dataset.episodes, reverse=True, key=lambda x: x.scene_id)
+    dataset.episodes = episodes[:]
+    episode_iter = dataset.get_episode_iterator(shuffle=False, cycle=False)
+
+    assert list(episode_iter) == episodes
diff --git a/habitat-lab-dialog/test/test_ddppo_reduce.py b/habitat-lab-dialog/test/test_ddppo_reduce.py
new file mode 100644
index 0000000..5eae6ff
--- /dev/null
+++ b/habitat-lab-dialog/test/test_ddppo_reduce.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
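Note: test_ddppo_reduce.py verifies that DDPPO keeps gradients synchronized across ranks. The primitive underneath is a collective sum-then-average; a minimal torch.distributed sketch reusing the test's own gloo + TCPStore + spawn pattern, with a hypothetical port:

import torch
import torch.distributed as distrib


def _worker(rank: int, world_size: int, port: int):
    store = distrib.TCPStore("127.0.0.1", port, world_size, rank == 0)
    distrib.init_process_group(
        "gloo", store=store, rank=rank, world_size=world_size
    )
    grad = torch.full((4,), float(rank))
    distrib.all_reduce(grad)  # sum over ranks ...
    grad /= world_size        # ... then average, as DDP does for gradients
    expected = sum(range(world_size)) / world_size
    assert torch.allclose(grad, torch.full((4,), expected))
    distrib.destroy_process_group()


if __name__ == "__main__":
    torch.multiprocessing.spawn(_worker, args=(2, 8749), nprocs=2)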
+ +import numpy as np +import pytest + +from habitat.core.spaces import ActionSpace, EmptySpace +from habitat.tasks.nav.nav import IntegratedPointGoalGPSAndCompassSensor + +torch = pytest.importorskip("torch") +habitat_baselines = pytest.importorskip("habitat_baselines") + +import gym +from torch import distributed as distrib +from torch import nn + +from habitat_baselines.common.rollout_storage import RolloutStorage +from habitat_baselines.config.default import get_config +from habitat_baselines.rl.ddppo.algo import DDPPO +from habitat_baselines.rl.ppo.policy import PointNavBaselinePolicy + + +def _worker_fn( + world_rank: int, world_size: int, port: int, unused_params: bool +): + device = ( + torch.device("cuda") + if torch.cuda.is_available() + else torch.device("cpu") + ) + tcp_store = distrib.TCPStore( # type: ignore + "127.0.0.1", port, world_size, world_rank == 0 + ) + distrib.init_process_group( + "gloo", store=tcp_store, rank=world_rank, world_size=world_size + ) + + config = get_config("habitat_baselines/config/test/ppo_pointnav_test.yaml") + obs_space = gym.spaces.Dict( + { + IntegratedPointGoalGPSAndCompassSensor.cls_uuid: gym.spaces.Box( + low=np.finfo(np.float32).min, + high=np.finfo(np.float32).max, + shape=(2,), + dtype=np.float32, + ) + } + ) + action_space = ActionSpace({"move": EmptySpace()}) + actor_critic = PointNavBaselinePolicy.from_config( + config, obs_space, action_space + ) + # This use adds some arbitrary parameters that aren't part of the computation + # graph, so they will mess up DDP if they aren't correctly ignored by it + if unused_params: + actor_critic.unused = nn.Linear(64, 64) + + actor_critic.to(device=device) + ppo_cfg = config.RL.PPO + agent = DDPPO( + actor_critic=actor_critic, + clip_param=ppo_cfg.clip_param, + ppo_epoch=ppo_cfg.ppo_epoch, + num_mini_batch=ppo_cfg.num_mini_batch, + value_loss_coef=ppo_cfg.value_loss_coef, + entropy_coef=ppo_cfg.entropy_coef, + lr=ppo_cfg.lr, + eps=ppo_cfg.eps, + max_grad_norm=ppo_cfg.max_grad_norm, + use_normalized_advantage=ppo_cfg.use_normalized_advantage, + ) + agent.init_distributed() + rollouts = RolloutStorage( + ppo_cfg.num_steps, + 2, + obs_space, + action_space, + ppo_cfg.hidden_size, + num_recurrent_layers=actor_critic.net.num_recurrent_layers, + is_double_buffered=False, + ) + rollouts.to(device) + + for k, v in rollouts.buffers["observations"].items(): + rollouts.buffers["observations"][k] = torch.randn_like(v) + + # Add two steps so batching works + rollouts.advance_rollout() + rollouts.advance_rollout() + + # Get a single batch + batch = next(rollouts.recurrent_generator(rollouts.buffers["returns"], 1)) + + # Call eval actions through the internal wrapper that is used in + # agent.update + value, action_log_probs, dist_entropy, _ = agent._evaluate_actions( + batch["observations"], + batch["recurrent_hidden_states"], + batch["prev_actions"], + batch["masks"], + batch["actions"], + ) + # Backprop on things + (value.mean() + action_log_probs.mean() + dist_entropy.mean()).backward() + + # Make sure all ranks have very similar parameters + for param in actor_critic.parameters(): + if param.grad is not None: + grads = [param.grad.detach().clone() for _ in range(world_size)] + distrib.all_gather(grads, grads[world_rank]) + + for i in range(world_size): + assert torch.isclose(grads[i], grads[world_rank]).all() + + +@pytest.mark.parametrize("unused_params", [True, False]) +def test_ddppo_reduce(unused_params: bool): + world_size = 2 + torch.multiprocessing.spawn( + _worker_fn, + args=(world_size, 8748 + 
int(unused_params), unused_params),
+        nprocs=world_size,
+    )
diff --git a/habitat-lab-dialog/test/test_demo_notebook.py b/habitat-lab-dialog/test/test_demo_notebook.py
new file mode 100644
index 0000000..5254eda
--- /dev/null
+++ b/habitat-lab-dialog/test/test_demo_notebook.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import gc
+
+import pytest
+
+import habitat
+from habitat.datasets.pointnav.pointnav_dataset import PointNavDatasetV1
+
+
+def test_demo_notebook():
+    config = habitat.get_config("configs/tasks/pointnav_mp3d.yaml")
+    config.defrost()
+    config.DATASET.SPLIT = "val"
+
+    if not PointNavDatasetV1.check_config_paths_exist(config.DATASET):
+        pytest.skip(
+            "Please download the Matterport3D PointNav val dataset and Matterport3D val scenes"
+        )
+    else:
+        pytest.main(["--nbval-lax", "notebooks/habitat-lab-demo.ipynb"])
+
+        # NB: Force a gc collect run as it can take a little bit for
+        # the cleanup to happen after the notebook and we get
+        # a double context crash!
+        gc.collect()
diff --git a/habitat-lab-dialog/test/test_examples.py b/habitat-lab-dialog/test/test_examples.py
new file mode 100644
index 0000000..8061695
--- /dev/null
+++ b/habitat-lab-dialog/test/test_examples.py
@@ -0,0 +1,62 @@
+import itertools
+import multiprocessing
+import runpy
+import sys
+from os import path as osp
+
+import pytest
+
+
+def run_main(*args):
+    # patch sys.argv
+    sys.argv = list(args)
+    target = args[0]
+    # run_path has one difference with invoking Python from command-line:
+    # if the target is a file (rather than a directory), it does not add its
+    # parent directory to sys.path. Thus, importing other modules from the
+    # same directory is broken unless sys.path is patched here.
+    if osp.isfile(target):
+        sys.path.insert(0, osp.dirname(target))
+    runpy.run_path(target, run_name="__main__")
+
+
+def powerset(iterable):
+    s = list(iterable)
+    return itertools.chain.from_iterable(
+        itertools.combinations(s, r) for r in range(len(s) + 1)
+    )
+
+
+def run_main_subproc(args):
+    # This test needs to be done in its own process as there is a potential for
+    # an OpenGL context clash otherwise
+    mp_ctx = multiprocessing.get_context("spawn")
+    proc = mp_ctx.Process(target=run_main, args=args)
+    proc.start()
+    proc.join()
+    assert proc.exitcode == 0
+
+
+@pytest.mark.skipif(
+    not osp.exists(
+        "data/scene_datasets/habitat-test-scenes/skokloster-castle.glb"
+    )
+    or not osp.exists(
+        "data/scene_datasets/habitat-test-scenes/van-gogh-room.glb"
+    )
+    or not osp.exists("data/scene_datasets/coda/coda.glb"),
+    reason="Requires the habitat-test-scenes",
+)
+@pytest.mark.parametrize(
+    "args",
+    [
+        (
+            "examples/tutorials/nb_python/Habitat_Interactive_Tasks.py",
+            "--no-show-video",
+            "--no-make-video",
+        ),
+        ("examples/tutorials/nb_python/Habitat_Lab.py",),
+    ],
+)
+def test_example_modules(args):
+    run_main_subproc(args)
diff --git a/habitat-lab-dialog/test/test_habitat_env.py b/habitat-lab-dialog/test/test_habitat_env.py
new file mode 100644
index 0000000..6f9af6c
--- /dev/null
+++ b/habitat-lab-dialog/test/test_habitat_env.py
@@ -0,0 +1,426 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
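Note: test_habitat_env.py revolves around habitat.VectorEnv, and its recurring construction idiom is zipping per-env configs, datasets, and ranks into env_fn_args. A sketch, assuming the Habitat test data is downloaded:

import habitat
from habitat.config.default import get_config

NUM = 2
configs = [
    get_config("configs/test/habitat_all_sensors_test.yaml")
    for _ in range(NUM)
]
datasets = [
    habitat.make_dataset(id_dataset=c.DATASET.TYPE, config=c.DATASET)
    for c in configs
]

# One (config, dataset, rank) tuple per worker, as in _load_test_data below.
env_fn_args = tuple(zip(configs, datasets, range(NUM)))

# The context manager shuts the workers down, which test_with_scope verifies.
with habitat.VectorEnv(env_fn_args=env_fn_args) as envs:
    observations = envs.reset()
    assert len(observations) == NUM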
+ +import itertools +import multiprocessing as mp +import os + +import numpy as np +import pytest + +import habitat +from habitat.config.default import get_config +from habitat.core.simulator import AgentState +from habitat.datasets.pointnav.pointnav_dataset import PointNavDatasetV1 +from habitat.tasks.nav.nav import NavigationEpisode, NavigationGoal, StopAction +from habitat.utils.test_utils import sample_non_stop_action + +CFG_TEST = "configs/test/habitat_all_sensors_test.yaml" +NUM_ENVS = 4 + + +class DummyRLEnv(habitat.RLEnv): + def __init__(self, config, dataset=None, env_ind=0): + super(DummyRLEnv, self).__init__(config, dataset) + self._env_ind = env_ind + + def get_reward_range(self): + return -1.0, 1.0 + + def get_reward(self, observations): + return 0.0 + + def get_done(self, observations): + done = False + if self._env.episode_over: + done = True + return done + + def get_info(self, observations): + return {} + + def get_env_ind(self): + return self._env_ind + + def set_env_ind(self, new_env_ind): + self._env_ind = new_env_ind + + +def _load_test_data(): + configs = [] + datasets = [] + for _ in range(NUM_ENVS): + config = get_config(CFG_TEST) + if not PointNavDatasetV1.check_config_paths_exist(config.DATASET): + pytest.skip("Please download Habitat test data to data folder.") + + datasets.append( + habitat.make_dataset( + id_dataset=config.DATASET.TYPE, config=config.DATASET + ) + ) + + config.defrost() + config.SIMULATOR.SCENE = datasets[-1].episodes[0].scene_id + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + config.freeze() + configs.append(config) + + return configs, datasets + + +def _vec_env_test_fn(configs, datasets, multiprocessing_start_method, gpu2gpu): + num_envs = len(configs) + for cfg in configs: + cfg.defrost() + cfg.SIMULATOR.HABITAT_SIM_V0.GPU_GPU = gpu2gpu + cfg.freeze() + + env_fn_args = tuple(zip(configs, datasets, range(num_envs))) + with habitat.VectorEnv( + env_fn_args=env_fn_args, + multiprocessing_start_method=multiprocessing_start_method, + ) as envs: + envs.reset() + + for _ in range(2 * configs[0].ENVIRONMENT.MAX_EPISODE_STEPS): + observations = envs.step( + sample_non_stop_action(envs.action_spaces[0], num_envs) + ) + assert len(observations) == num_envs + + +@pytest.mark.parametrize( + "multiprocessing_start_method,gpu2gpu", + itertools.product(["forkserver", "spawn", "fork"], [True, False]), +) +def test_vectorized_envs(multiprocessing_start_method, gpu2gpu): + import habitat_sim + + if gpu2gpu and not habitat_sim.cuda_enabled: + pytest.skip("GPU-GPU requires CUDA") + + configs, datasets = _load_test_data() + if multiprocessing_start_method == "fork": + if gpu2gpu: + pytest.skip("Fork does not support gpu2gpu") + + # 'fork' works in a process that has yet to use the GPU + # this test uses spawns a new python instance, which allows us to fork + mp_ctx = mp.get_context("spawn") + p = mp_ctx.Process( + target=_vec_env_test_fn, + args=(configs, datasets, multiprocessing_start_method, gpu2gpu), + ) + p.start() + p.join() + assert p.exitcode == 0 + else: + _vec_env_test_fn( + configs, datasets, multiprocessing_start_method, gpu2gpu + ) + + +def test_with_scope(): + configs, datasets = _load_test_data() + num_envs = len(configs) + env_fn_args = tuple(zip(configs, datasets, range(num_envs))) + with habitat.VectorEnv( + env_fn_args=env_fn_args, multiprocessing_start_method="forkserver" + ) as envs: + envs.reset() + + assert envs._is_closed + + +def test_number_of_episodes(): + configs, 
datasets = _load_test_data() + num_envs = len(configs) + env_fn_args = tuple(zip(configs, datasets, range(num_envs))) + with habitat.VectorEnv( + env_fn_args=env_fn_args, multiprocessing_start_method="forkserver" + ) as envs: + assert envs.number_of_episodes == [10000, 10000, 10000, 10000] + + +def test_threaded_vectorized_env(): + configs, datasets = _load_test_data() + num_envs = len(configs) + env_fn_args = tuple(zip(configs, datasets, range(num_envs))) + with habitat.ThreadedVectorEnv(env_fn_args=env_fn_args) as envs: + envs.reset() + + for _ in range(2 * configs[0].ENVIRONMENT.MAX_EPISODE_STEPS): + observations = envs.step( + sample_non_stop_action(envs.action_spaces[0], num_envs) + ) + assert len(observations) == num_envs + + +@pytest.mark.parametrize("gpu2gpu", [False, True]) +def test_env(gpu2gpu): + import habitat_sim + + if gpu2gpu and not habitat_sim.cuda_enabled: + pytest.skip("GPU-GPU requires CUDA") + + config = get_config(CFG_TEST) + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + + config.defrost() + config.SIMULATOR.HABITAT_SIM_V0.GPU_GPU = gpu2gpu + config.freeze() + with habitat.Env(config=config, dataset=None) as env: + env.episodes = [ + NavigationEpisode( + episode_id="0", + scene_id=config.SIMULATOR.SCENE, + start_position=[-3.0133917, 0.04623024, 7.3064547], + start_rotation=[0, 0.163276, 0, 0.98658], + goals=[ + NavigationGoal( + position=[-3.0133917, 0.04623024, 7.3064547] + ) + ], + info={"geodesic_distance": 0.001}, + ) + ] + env.reset() + + for _ in range(config.ENVIRONMENT.MAX_EPISODE_STEPS): + env.step(sample_non_stop_action(env.action_space)) + + # check for steps limit on environment + assert env.episode_over is True, ( + "episode should be over after " "max_episode_steps" + ) + + env.reset() + + env.step(action={"action": StopAction.name}) + # check for STOP action + assert ( + env.episode_over is True + ), "episode should be over after STOP action" + + +def make_rl_env(config, dataset, rank: int = 0): + r"""Constructor for default habitat Env. 
+ :param config: configurations for environment + :param dataset: dataset for environment + :param rank: rank for setting seeds for environment + :return: constructed habitat Env + """ + env = DummyRLEnv(config=config, dataset=dataset) + env.seed(config.SEED + rank) + return env + + +@pytest.mark.parametrize("gpu2gpu", [False, True]) +def test_rl_vectorized_envs(gpu2gpu): + import habitat_sim + + if gpu2gpu and not habitat_sim.cuda_enabled: + pytest.skip("GPU-GPU requires CUDA") + + configs, datasets = _load_test_data() + for config in configs: + config.defrost() + config.SIMULATOR.HABITAT_SIM_V0.GPU_GPU = gpu2gpu + config.freeze() + + num_envs = len(configs) + env_fn_args = tuple(zip(configs, datasets, range(num_envs))) + with habitat.VectorEnv( + make_env_fn=make_rl_env, env_fn_args=env_fn_args + ) as envs: + envs.reset() + + for i in range(2 * configs[0].ENVIRONMENT.MAX_EPISODE_STEPS): + outputs = envs.step( + sample_non_stop_action(envs.action_spaces[0], num_envs) + ) + observations, rewards, dones, infos = [ + list(x) for x in zip(*outputs) + ] + assert len(observations) == num_envs + assert len(rewards) == num_envs + assert len(dones) == num_envs + assert len(infos) == num_envs + + tiled_img = envs.render(mode="rgb_array") + new_height = int(np.ceil(np.sqrt(NUM_ENVS))) + new_width = int(np.ceil(float(NUM_ENVS) / new_height)) + print(f"observations: {observations}") + h, w, c = observations[0]["rgb"].shape + assert tiled_img.shape == ( + h * new_height, + w * new_width, + c, + ), "vector env render is broken" + + if (i + 1) % configs[0].ENVIRONMENT.MAX_EPISODE_STEPS == 0: + assert all( + dones + ), "dones should be true after max_episode steps" + + +@pytest.mark.parametrize("gpu2gpu", [False, True]) +def test_rl_env(gpu2gpu): + import habitat_sim + + if gpu2gpu and not habitat_sim.cuda_enabled: + pytest.skip("GPU-GPU requires CUDA") + + config = get_config(CFG_TEST) + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + + config.defrost() + config.SIMULATOR.HABITAT_SIM_V0.GPU_GPU = gpu2gpu + config.freeze() + + with DummyRLEnv(config=config, dataset=None) as env: + env.episodes = [ + NavigationEpisode( + episode_id="0", + scene_id=config.SIMULATOR.SCENE, + start_position=[-3.0133917, 0.04623024, 7.3064547], + start_rotation=[0, 0.163276, 0, 0.98658], + goals=[ + NavigationGoal( + position=[-3.0133917, 0.04623024, 7.3064547] + ) + ], + info={"geodesic_distance": 0.001}, + ) + ] + + done = False + env.reset() + + for _ in range(config.ENVIRONMENT.MAX_EPISODE_STEPS): + observation, reward, done, info = env.step( + action=sample_non_stop_action(env.action_space) + ) + + # check for steps limit on environment + assert done is True, "episodes should be over after max_episode_steps" + + env.reset() + observation, reward, done, info = env.step( + action={"action": StopAction.name} + ) + assert done is True, "done should be true after STOP action" + + +def _make_dummy_env_func(config, dataset, env_id): + return DummyRLEnv(config=config, dataset=dataset, env_ind=env_id) + + +def test_vec_env_call_func(): + configs, datasets = _load_test_data() + num_envs = len(configs) + env_fn_args = tuple(zip(configs, datasets, range(num_envs))) + true_env_ids = list(range(num_envs)) + with habitat.VectorEnv( + make_env_fn=_make_dummy_env_func, + env_fn_args=env_fn_args, + multiprocessing_start_method="forkserver", + ) as envs: + envs.reset() + env_ids = envs.call(["get_env_ind"] * num_envs) + assert env_ids == true_env_ids + + env_id = 
envs.call_at(1, "get_env_ind") + assert env_id == true_env_ids[1] + + envs.call_at(2, "set_env_ind", {"new_env_ind": 20}) + true_env_ids[2] = 20 + env_ids = envs.call(["get_env_ind"] * num_envs) + assert env_ids == true_env_ids + + envs.call_at(2, "set_env_ind", {"new_env_ind": 2}) + true_env_ids[2] = 2 + env_ids = envs.call(["get_env_ind"] * num_envs) + assert env_ids == true_env_ids + + envs.pause_at(0) + true_env_ids.pop(0) + env_ids = envs.call(["get_env_ind"] * num_envs) + assert env_ids == true_env_ids + + envs.pause_at(0) + true_env_ids.pop(0) + env_ids = envs.call(["get_env_ind"] * num_envs) + assert env_ids == true_env_ids + + envs.resume_all() + env_ids = envs.call(["get_env_ind"] * num_envs) + assert env_ids == list(range(num_envs)) + + +def test_close_with_paused(): + configs, datasets = _load_test_data() + num_envs = len(configs) + env_fn_args = tuple(zip(configs, datasets, range(num_envs))) + with habitat.VectorEnv( + env_fn_args=env_fn_args, multiprocessing_start_method="forkserver" + ) as envs: + envs.reset() + + envs.pause_at(3) + envs.pause_at(0) + + assert envs._is_closed + + +# TODO Bring back this test for the greedy follower +@pytest.mark.skip +def test_action_space_shortest_path(): + config = get_config() + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + + env = habitat.Env(config=config, dataset=None) + + # action space shortest path + source_position = env.sim.sample_navigable_point() + angles = list(range(-180, 180, config.SIMULATOR.TURN_ANGLE)) + angle = np.radians(np.random.choice(angles)) + source_rotation = [0, np.sin(angle / 2), 0, np.cos(angle / 2)] + source = AgentState(source_position, source_rotation) + + reachable_targets = [] + unreachable_targets = [] + while len(reachable_targets) < 5: + position = env.sim.sample_navigable_point() + angles = list(range(-180, 180, config.SIMULATOR.TURN_ANGLE)) + angle = np.radians(np.random.choice(angles)) + rotation = [0, np.sin(angle / 2), 0, np.cos(angle / 2)] + if env.sim.geodesic_distance(source_position, [position]) != np.inf: + reachable_targets.append(AgentState(position, rotation)) + + while len(unreachable_targets) < 3: + position = env.sim.sample_navigable_point() + # Change height of the point to make it unreachable + position[1] = 100 + angles = list(range(-180, 180, config.SIMULATOR.TURN_ANGLE)) + angle = np.radians(np.random.choice(angles)) + rotation = [0, np.sin(angle / 2), 0, np.cos(angle / 2)] + if env.sim.geodesic_distance(source_position, [position]) == np.inf: + unreachable_targets.append(AgentState(position, rotation)) + + targets = reachable_targets + shortest_path1 = env.action_space_shortest_path(source, targets) + assert shortest_path1 != [] + + targets = unreachable_targets + shortest_path2 = env.action_space_shortest_path(source, targets) + assert shortest_path2 == [] + env.close() diff --git a/habitat-lab-dialog/test/test_habitat_example.py b/habitat-lab-dialog/test/test_habitat_example.py new file mode 100644 index 0000000..493c241 --- /dev/null +++ b/habitat-lab-dialog/test/test_habitat_example.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
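Note on test_vec_env_call_func above: call broadcasts a method name to every worker, call_at targets a single one, and pause_at/resume_all shrink and restore the active set. A fragment continuing from the VectorEnv sketch earlier, assuming the env class exposes get_env_ind/set_env_ind like DummyRLEnv:

# `envs` as constructed in the earlier sketch, with DummyRLEnv-style methods.
env_ids = envs.call(["get_env_ind"] * envs.num_envs)  # one call per worker
first = envs.call_at(1, "get_env_ind")                # single worker, by index
envs.call_at(2, "set_env_ind", {"new_env_ind": 20})   # kwargs passed as a dict

envs.pause_at(0)   # worker 0 keeps its process but leaves the step rotation
envs.resume_all()  # paused workers rejoin in their original order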
+ +import pytest + +import habitat +from examples import ( + new_actions, + register_new_sensors_and_measures, + shortest_path_follower_example, + visualization_examples, +) +from examples.example import example +from habitat.datasets.pointnav.pointnav_dataset import PointNavDatasetV1 + + +def test_readme_example(): + if not PointNavDatasetV1.check_config_paths_exist( + config=habitat.get_config().DATASET + ): + pytest.skip("Please download Habitat test data to data folder.") + example() + + +def test_visualizations_example(): + if not PointNavDatasetV1.check_config_paths_exist( + config=habitat.get_config().DATASET + ): + pytest.skip("Please download Habitat test data to data folder.") + visualization_examples.main() + + +def test_shortest_path_follower_example(): + if not PointNavDatasetV1.check_config_paths_exist( + config=habitat.get_config().DATASET + ): + pytest.skip("Please download Habitat test data to data folder.") + shortest_path_follower_example.main() + + +def test_register_new_sensors_and_measures(): + if not PointNavDatasetV1.check_config_paths_exist( + config=habitat.get_config().DATASET + ): + pytest.skip("Please download Habitat test data to data folder.") + + register_new_sensors_and_measures.main() + + +def test_new_actions(): + if not PointNavDatasetV1.check_config_paths_exist( + config=habitat.get_config().DATASET + ): + pytest.skip("Please download Habitat test data to data folder.") + + new_actions.main() diff --git a/habitat-lab-dialog/test/test_habitat_sim.py b/habitat-lab-dialog/test/test_habitat_sim.py new file mode 100644 index 0000000..5301717 --- /dev/null +++ b/habitat-lab-dialog/test/test_habitat_sim.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
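Note: test_habitat_sim.py pins simulator output to golden trajectory data; the geodesic-distance API it exercises is small. A sketch, assuming the Habitat test scene is present:

from habitat.config.default import get_config
from habitat.sims import make_sim

config = get_config()
with make_sim(config.SIMULATOR.TYPE, config=config.SIMULATOR) as sim:
    sim.reset()
    start = sim.sample_navigable_point()
    goal = sim.sample_navigable_point()
    # Accepts one or many targets; returns np.inf when no path exists.
    print(sim.geodesic_distance(start, [goal]))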
+ +import json +import os + +import numpy as np +import pytest + +from habitat.config.default import get_config +from habitat.sims import make_sim +from habitat.sims.habitat_simulator.actions import HabitatSimActions + + +def init_sim(): + config = get_config() + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + return make_sim(config.SIMULATOR.TYPE, config=config.SIMULATOR) + + +def test_sim_trajectory(): + with open("test/data/habitat-sim_trajectory_data.json", "r") as f: + test_trajectory = json.load(f) + with init_sim() as sim: + + sim.reset() + sim.set_agent_state( + position=test_trajectory["positions"][0], + rotation=test_trajectory["rotations"][0], + ) + + # remove last stop action as Sim has no stop action anymore + for i, action in enumerate(test_trajectory["actions"][:-1]): + action = HabitatSimActions[action] + if i > 0: # ignore first step as habitat-sim doesn't update + # agent until then + state = sim.get_agent_state() + assert ( + np.allclose( + np.array( + test_trajectory["positions"][i], dtype=np.float32 + ), + state.position, + ) + is True + ), "mismatch in position " "at step {}".format(i) + assert ( + np.allclose( + np.array( + test_trajectory["rotations"][i], dtype=np.float32 + ), + np.array([*state.rotation.imag, state.rotation.real]), + ) + is True + ), "mismatch in rotation " "at step {}".format(i) + + max_search_radius = 2.0 + dist_to_obs = sim.distance_to_closest_obstacle( + state.position, max_search_radius + ) + assert np.isclose( + dist_to_obs, test_trajectory["distances_to_obstacles"][i] + ) + + assert sim.action_space.contains(action) + + sim.step(action) + + +def test_sim_no_sensors(): + config = get_config() + config.defrost() + config.SIMULATOR.AGENT_0.SENSORS = [] + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + with make_sim(config.SIMULATOR.TYPE, config=config.SIMULATOR) as sim: + sim.reset() + + +def test_sim_geodesic_distance(): + config = get_config() + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + with make_sim(config.SIMULATOR.TYPE, config=config.SIMULATOR) as sim: + sim.reset() + + with open( + os.path.join( + os.path.dirname(__file__), + "data", + "test-sim-geodesic-distance-test-golden.json", + ), + "r", + ) as f: + test_data = json.load(f) + + for test_case in test_data["single_end"]: + assert np.isclose( + sim.geodesic_distance(test_case["start"], test_case["end"]), + test_case["expected"], + ), "Geodesic distance mechanism has been changed" + + for test_case in test_data["multi_end"]: + assert np.isclose( + sim.geodesic_distance(test_case["start"], test_case["ends"]), + test_case["expected"], + ), "Geodesic distance mechanism has been changed" + + assert np.isclose( + sim.geodesic_distance(test_case["start"], test_case["ends"]), + np.min( + [ + sim.geodesic_distance(test_case["start"], end) + for end in test_case["ends"] + ] + ), + ), "Geodesic distance for multi target setup isn't equal to separate single target calls." diff --git a/habitat-lab-dialog/test/test_habitat_task.py b/habitat-lab-dialog/test/test_habitat_task.py new file mode 100644 index 0000000..0ce6bd7 --- /dev/null +++ b/habitat-lab-dialog/test/test_habitat_task.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. 
+# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import os + +import numpy as np +import pytest + +import habitat +from habitat.utils.test_utils import sample_non_stop_action + +CFG_TEST = "configs/test/habitat_all_sensors_test.yaml" +TELEPORT_POSITION = np.array([-3.2890449, 0.15067159, 11.124366]) +TELEPORT_ROTATION = np.array([0.92035, 0, -0.39109465, 0]) + + +def test_task_actions(): + config = habitat.get_config(config_paths=CFG_TEST) + config.defrost() + config.TASK.POSSIBLE_ACTIONS = config.TASK.POSSIBLE_ACTIONS + ["TELEPORT"] + config.freeze() + + with habitat.Env(config=config) as env: + env.reset() + action = { + "action": "TELEPORT", + "action_args": { + "position": TELEPORT_POSITION, + "rotation": TELEPORT_ROTATION, + }, + } + assert env.action_space.contains(action) + env.step(action) + agent_state = env.sim.get_agent_state() + assert np.allclose( + np.array(TELEPORT_POSITION, dtype=np.float32), agent_state.position + ), "mismatch in position after teleport" + assert np.allclose( + np.array(TELEPORT_ROTATION, dtype=np.float32), + np.array([*agent_state.rotation.imag, agent_state.rotation.real]), + ), "mismatch in rotation after teleport" + env.step("TURN_RIGHT") + + +def test_task_actions_sampling_for_teleport(): + config = habitat.get_config(config_paths=CFG_TEST) + config.defrost() + config.TASK.POSSIBLE_ACTIONS = config.TASK.POSSIBLE_ACTIONS + ["TELEPORT"] + config.freeze() + + with habitat.Env(config=config) as env: + env.reset() + while not env.episode_over: + action = sample_non_stop_action(env.action_space) + assert env.action_space.contains(action) + habitat.logger.info( + f"Action : " + f"{action['action']}, " + f"args: {action['action_args']}." + ) + env.step(action) + agent_state = env.sim.get_agent_state() + habitat.logger.info(agent_state) + + +@pytest.mark.parametrize( + "config_file", + [ + CFG_TEST, + "configs/tasks/pointnav.yaml", + "configs/test/habitat_mp3d_eqa_test.yaml", + ], +) +def test_task_actions_sampling(config_file): + config = habitat.get_config(config_paths=config_file) + if not os.path.exists( + config.DATASET.DATA_PATH.format(split=config.DATASET.SPLIT) + ): + pytest.skip( + f"Please download dataset to data folder " + f"{config.DATASET.DATA_PATH}." + ) + + with habitat.Env(config=config) as env: + env.reset() + while not env.episode_over: + action = sample_non_stop_action(env.action_space) + assert env.action_space.contains(action) + habitat.logger.info( + f"Action : " + f"{action['action']}, " + f"args: {action['action_args']}." + ) + env.step(action) + agent_state = env.sim.get_agent_state() + habitat.logger.info(agent_state) diff --git a/habitat-lab-dialog/test/test_install.py b/habitat-lab-dialog/test/test_install.py new file mode 100644 index 0000000..babfa59 --- /dev/null +++ b/habitat-lab-dialog/test/test_install.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
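Note on the TELEPORT tests above: any action registered with the task becomes available once its name is added to TASK.POSSIBLE_ACTIONS, and it is invoked as a dict with "action" and "action_args" keys. Condensed from test_task_actions:

import numpy as np

import habitat

config = habitat.get_config(
    config_paths="configs/test/habitat_all_sensors_test.yaml"
)
config.defrost()
config.TASK.POSSIBLE_ACTIONS = config.TASK.POSSIBLE_ACTIONS + ["TELEPORT"]
config.freeze()

with habitat.Env(config=config) as env:
    env.reset()
    env.step(
        {
            "action": "TELEPORT",
            "action_args": {
                "position": np.array([-3.2890449, 0.15067159, 11.124366]),
                "rotation": np.array([0.92035, 0, -0.39109465, 0]),
            },
        }
    )
    print(env.sim.get_agent_state().position)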
+ +import habitat +from habitat.core.logging import logger + + +def test_habitat_install(): + r"""dummy test for testing installation""" + logger.info(str(habitat)) diff --git a/habitat-lab-dialog/test/test_mp3d_eqa.py b/habitat-lab-dialog/test/test_mp3d_eqa.py new file mode 100644 index 0000000..064245f --- /dev/null +++ b/habitat-lab-dialog/test/test_mp3d_eqa.py @@ -0,0 +1,323 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import time + +import numpy as np +import pytest + +import habitat +from habitat.config.default import get_config +from habitat.core.embodied_task import Episode +from habitat.core.logging import logger +from habitat.datasets import make_dataset +from habitat.datasets.eqa import mp3d_eqa_dataset as mp3d_dataset +from habitat.tasks.eqa.eqa import AnswerAction +from habitat.tasks.nav.nav import MoveForwardAction +from habitat.utils.geometry_utils import ( + angle_between_quaternions, + quaternion_from_coeff, +) +from habitat.utils.test_utils import sample_non_stop_action + +CFG_TEST = "configs/test/habitat_mp3d_eqa_test.yaml" +CLOSE_STEP_THRESHOLD = 0.028 +OLD_STOP_ACTION_ID = 3 + + +# List of episodes each from unique house +TEST_EPISODE_SET = [1, 309, 807, 958, 696, 10, 297, 1021, 1307, 1569] + +RGB_EPISODE_MEANS = { + 1: 123.1576333222566, + 10: 123.86094605688947, + 297: 122.69351220853402, + 309: 118.95794969775298, + 696: 115.71903709129052, + 807: 143.7834237211494, + 958: 141.97871610030387, + 1021: 119.1051016229882, + 1307: 102.11408987112925, + 1569: 91.01973929495183, +} + +EPISODES_LIMIT = 6 + + +def get_minos_for_sim_eqa_config(): + _sim_eqa_c = get_config(CFG_TEST) + _sim_eqa_c.task_name = "EQA-v0" + _sim_eqa_c.dataset = mp3d_dataset.get_default_mp3d_v1_config() + _sim_eqa_c.dataset.split = "val" + _sim_eqa_c.scene = "data/scene_datasets/mp3d/17DRP5sb8fy/17DRP5sb8fy.glb" + _sim_eqa_c.height = 512 + _sim_eqa_c.width = 512 + _sim_eqa_c.hfov = "45" + _sim_eqa_c.vfov = "45" + _sim_eqa_c.sensor_position = [0, 1.09, 0] + _sim_eqa_c.forward_step_size = 0.1 # in metres + _sim_eqa_c.turn_angle = 9 # in degrees + _sim_eqa_c.sim = "Sim-v0" + + # Agent configuration + agent_c = _sim_eqa_c.agents[0] + agent_c.height = 1.5 + agent_c.radius = 0.1 + agent_c.mass = 32.0 + agent_c.linear_acceleration = 10.0 + agent_c.angular_acceleration = 5 * 3.14 + agent_c.linear_friction = 1.0 + agent_c.angular_friction = 1.0 + agent_c.coefficient_of_restitution = 0.15707963267 + + return _sim_eqa_c + + +def check_json_serializaiton(dataset: habitat.Dataset): + start_time = time.time() + json_str = str(dataset.to_json()) + logger.info( + "JSON conversion finished. 
{} sec".format((time.time() - start_time)) + ) + decoded_dataset = dataset.__class__() + decoded_dataset.from_json(json_str) + assert len(decoded_dataset.episodes) > 0 + episode = decoded_dataset.episodes[0] + assert isinstance(episode, Episode) + assert ( + decoded_dataset.to_json() == json_str + ), "JSON dataset encoding/decoding isn't consistent" + + +def test_mp3d_eqa_dataset(): + dataset_config = get_config(CFG_TEST).DATASET + if not mp3d_dataset.Matterport3dDatasetV1.check_config_paths_exist( + dataset_config + ): + pytest.skip("Please download Matterport3D EQA dataset to data folder.") + + dataset = mp3d_dataset.Matterport3dDatasetV1(config=dataset_config) + assert dataset + assert ( + len(dataset.episodes) == mp3d_dataset.EQA_MP3D_V1_VAL_EPISODE_COUNT + ), "Test split episode number mismatch" + check_json_serializaiton(dataset) + + +@pytest.mark.parametrize("split", ["train", "val"]) +def test_dataset_splitting(split): + + dataset_config = get_config(CFG_TEST).DATASET + dataset_config.defrost() + dataset_config.SPLIT = split + if not mp3d_dataset.Matterport3dDatasetV1.check_config_paths_exist( + dataset_config + ): + pytest.skip("Please download Matterport3D EQA dataset to data folder.") + + scenes = mp3d_dataset.Matterport3dDatasetV1.get_scenes_to_load( + config=dataset_config + ) + assert ( + len(scenes) > 0 + ), "Expected dataset contains separate episode file per scene." + + dataset_config.CONTENT_SCENES = scenes + full_dataset = make_dataset( + id_dataset=dataset_config.TYPE, config=dataset_config + ) + full_episodes = { + (ep.scene_id, ep.episode_id) for ep in full_dataset.episodes + } + + dataset_config.CONTENT_SCENES = scenes[0 : len(scenes) // 2] + split1_dataset = make_dataset( + id_dataset=dataset_config.TYPE, config=dataset_config + ) + split1_episodes = { + (ep.scene_id, ep.episode_id) for ep in split1_dataset.episodes + } + + dataset_config.CONTENT_SCENES = scenes[len(scenes) // 2 :] + split2_dataset = make_dataset( + id_dataset=dataset_config.TYPE, config=dataset_config + ) + split2_episodes = { + (ep.scene_id, ep.episode_id) for ep in split2_dataset.episodes + } + + assert full_episodes == split1_episodes.union( + split2_episodes + ), "Split dataset is not equal to full dataset" + assert ( + len(split1_episodes.intersection(split2_episodes)) == 0 + ), "Intersection of split datasets is not the empty set" + + +def test_mp3d_eqa_sim(): + eqa_config = get_config(CFG_TEST) + + if not mp3d_dataset.Matterport3dDatasetV1.check_config_paths_exist( + eqa_config.DATASET + ): + pytest.skip("Please download Matterport3D EQA dataset to data folder.") + + dataset = make_dataset( + id_dataset=eqa_config.DATASET.TYPE, config=eqa_config.DATASET + ) + with habitat.Env(config=eqa_config, dataset=dataset) as env: + env.episodes = dataset.episodes[:EPISODES_LIMIT] + + env.reset() + while not env.episode_over: + obs = env.step(env.task.action_space.sample()) + if not env.episode_over: + assert "rgb" in obs, "RGB image is missing in observation." 
+ assert obs["rgb"].shape[:2] == ( + eqa_config.SIMULATOR.RGB_SENSOR.HEIGHT, + eqa_config.SIMULATOR.RGB_SENSOR.WIDTH, + ), "Observation resolution {} doesn't correspond to config " "({}, {}).".format( + obs["rgb"].shape[:2], + eqa_config.SIMULATOR.RGB_SENSOR.HEIGHT, + eqa_config.SIMULATOR.RGB_SENSOR.WIDTH, + ) + + +def test_mp3d_eqa_sim_correspondence(): + eqa_config = get_config(CFG_TEST) + + if not mp3d_dataset.Matterport3dDatasetV1.check_config_paths_exist( + eqa_config.DATASET + ): + pytest.skip("Please download Matterport3D EQA dataset to data folder.") + + dataset = make_dataset( + id_dataset=eqa_config.DATASET.TYPE, config=eqa_config.DATASET + ) + with habitat.Env(config=eqa_config, dataset=dataset) as env: + env.episodes = [ + episode + for episode in dataset.episodes + if int(episode.episode_id) in TEST_EPISODE_SET[:EPISODES_LIMIT] + ] + + ep_i = 0 + cycles_n = 2 + while cycles_n > 0: + env.reset() + episode = env.current_episode + assert ( + len(episode.goals) == 1 + ), "Episode has no goals or more than one." + assert ( + len(episode.shortest_paths) == 1 + ), "Episode has no shortest paths or more than one." + start_state = env.sim.get_agent_state() + assert np.allclose( + start_state.position, episode.start_position + ), "Agent's start position diverges from the shortest path's one." + + rgb_mean = 0 + logger.info( + "{id} {question}\n{answer}".format( + id=episode.episode_id, + question=episode.question.question_text, + answer=episode.question.answer_text, + ) + ) + + for step_id, point in enumerate(episode.shortest_paths[0]): + cur_state = env.sim.get_agent_state() + + logger.info( + "diff position: {} diff rotation: {} " + "cur_state.position: {} shortest_path.position: {} " + "cur_state.rotation: {} shortest_path.rotation: {} action: {}" + "".format( + cur_state.position - point.position, + angle_between_quaternions( + cur_state.rotation, + quaternion_from_coeff(point.rotation), + ), + cur_state.position, + point.position, + cur_state.rotation, + point.rotation, + point.action, + ) + ) + + assert np.allclose( + [cur_state.position[0], cur_state.position[2]], + [point.position[0], point.position[2]], + atol=CLOSE_STEP_THRESHOLD * (step_id + 1), + ), "Agent's path diverges from the shortest path." + + if point.action != OLD_STOP_ACTION_ID: + obs = env.step(action=point.action) + + if not env.episode_over: + rgb_mean += obs["rgb"][:, :, :3].mean() + + if ep_i < len(RGB_EPISODE_MEANS): + # Slightly bigger atol for basis meshes + rgb_mean = rgb_mean / len(episode.shortest_paths[0]) + assert np.isclose( + RGB_EPISODE_MEANS[int(episode.episode_id)], + rgb_mean, + atol=0.5, + ), "RGB output doesn't match the ground truth." 
+ + ep_i = (ep_i + 1) % EPISODES_LIMIT + if ep_i == 0: + cycles_n -= 1 + + +def test_eqa_task(): + eqa_config = get_config(CFG_TEST) + + if not mp3d_dataset.Matterport3dDatasetV1.check_config_paths_exist( + eqa_config.DATASET + ): + pytest.skip("Please download Matterport3D EQA dataset to data folder.") + + dataset = make_dataset( + id_dataset=eqa_config.DATASET.TYPE, config=eqa_config.DATASET + ) + with habitat.Env(config=eqa_config, dataset=dataset) as env: + env.episodes = list( + filter( + lambda e: int(e.episode_id) + in TEST_EPISODE_SET[:EPISODES_LIMIT], + dataset.episodes, + ) + ) + + env.reset() + + for _ in range(10): + action = sample_non_stop_action(env.action_space) + if action["action"] != AnswerAction.name: + env.step(action) + metrics = env.get_metrics() + del metrics["episode_info"] + logger.info(metrics) + + correct_answer_id = env.current_episode.question.answer_token + env.step( + { + "action": AnswerAction.name, + "action_args": {"answer_id": correct_answer_id}, + } + ) + + metrics = env.get_metrics() + del metrics["episode_info"] + logger.info(metrics) + assert metrics["answer_accuracy"] == 1 + + with pytest.raises(AssertionError): + env.step({"action": MoveForwardAction.name}) diff --git a/habitat-lab-dialog/test/test_object_nav_task.py b/habitat-lab-dialog/test/test_object_nav_task.py new file mode 100644 index 0000000..08b64b5 --- /dev/null +++ b/habitat-lab-dialog/test/test_object_nav_task.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import time + +import pytest + +import habitat +from habitat.config.default import get_config +from habitat.core.embodied_task import Episode +from habitat.core.logging import logger +from habitat.datasets import make_dataset +from habitat.datasets.object_nav.object_nav_dataset import ObjectNavDatasetV1 +from habitat.tasks.nav.nav import MoveForwardAction + +CFG_TEST = "configs/test/habitat_mp3d_object_nav_test.yaml" +EPISODES_LIMIT = 6 +PARTIAL_LOAD_SCENES = 3 + + +def check_json_serializaiton(dataset: habitat.Dataset): + start_time = time.time() + json_str = dataset.to_json() + logger.info( + "JSON conversion finished. {} sec".format((time.time() - start_time)) + ) + decoded_dataset = ObjectNavDatasetV1() + decoded_dataset.from_json(json_str) + assert len(decoded_dataset.episodes) == len(dataset.episodes) + episode = decoded_dataset.episodes[0] + assert isinstance(episode, Episode) + + # The strings won't match exactly as dictionaries don't have an order for the keys + # Thus we need to parse the json strings and compare the serialized forms + assert json.loads(decoded_dataset.to_json()) == json.loads( + json_str + ), "JSON dataset encoding/decoding isn't consistent" + + +def test_mp3d_object_nav_dataset(): + dataset_config = get_config(CFG_TEST).DATASET + if not ObjectNavDatasetV1.check_config_paths_exist(dataset_config): + pytest.skip( + "Please download Matterport3D ObjectNav Dataset to data folder." 
+ ) + + dataset = habitat.make_dataset( + id_dataset=dataset_config.TYPE, config=dataset_config + ) + assert dataset + dataset.episodes = dataset.episodes[0:EPISODES_LIMIT] + dataset.goals_by_category = { + k: v + for k, v in dataset.goals_by_category.items() + if k in (ep.goals_key for ep in dataset.episodes) + } + check_json_serializaiton(dataset) + + +@pytest.mark.parametrize("split", ["train", "val"]) +def test_dataset_splitting(split): + dataset_config = get_config(CFG_TEST).DATASET + dataset_config.defrost() + dataset_config.SPLIT = split + + if not ObjectNavDatasetV1.check_config_paths_exist(dataset_config): + pytest.skip("Test skipped as dataset files are missing.") + + scenes = ObjectNavDatasetV1.get_scenes_to_load(config=dataset_config) + assert ( + len(scenes) > 0 + ), "Expected dataset contains separate episode file per scene." + + dataset_config.CONTENT_SCENES = scenes[:PARTIAL_LOAD_SCENES] + full_dataset = make_dataset( + id_dataset=dataset_config.TYPE, config=dataset_config + ) + full_episodes = { + (ep.scene_id, ep.episode_id) for ep in full_dataset.episodes + } + + dataset_config.CONTENT_SCENES = scenes[: PARTIAL_LOAD_SCENES // 2] + split1_dataset = make_dataset( + id_dataset=dataset_config.TYPE, config=dataset_config + ) + split1_episodes = { + (ep.scene_id, ep.episode_id) for ep in split1_dataset.episodes + } + + dataset_config.CONTENT_SCENES = scenes[ + PARTIAL_LOAD_SCENES // 2 : PARTIAL_LOAD_SCENES + ] + split2_dataset = make_dataset( + id_dataset=dataset_config.TYPE, config=dataset_config + ) + split2_episodes = { + (ep.scene_id, ep.episode_id) for ep in split2_dataset.episodes + } + + assert full_episodes == split1_episodes.union( + split2_episodes + ), "Split dataset is not equal to full dataset" + assert ( + len(split1_episodes.intersection(split2_episodes)) == 0 + ), "Intersection of split datasets is not the empty set" + + +def test_object_nav_task(): + config = get_config(CFG_TEST) + + if not ObjectNavDatasetV1.check_config_paths_exist(config.DATASET): + pytest.skip( + "Please download Matterport3D scene and ObjectNav Datasets to data folder." + ) + + dataset = make_dataset( + id_dataset=config.DATASET.TYPE, config=config.DATASET + ) + with habitat.Env(config=config, dataset=dataset) as env: + for _ in range(10): + env.reset() + while not env.episode_over: + action = env.action_space.sample() + habitat.logger.info( + f"Action : " + f"{action['action']}, " + f"args: {action['action_args']}." + ) + env.step(action) + + metrics = env.get_metrics() + logger.info(metrics) + + with pytest.raises(AssertionError): + env.step({"action": MoveForwardAction.name}) diff --git a/habitat-lab-dialog/test/test_pointnav_dataset.py b/habitat-lab-dialog/test/test_pointnav_dataset.py new file mode 100644 index 0000000..8987354 --- /dev/null +++ b/habitat-lab-dialog/test/test_pointnav_dataset.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
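+#
+# These tests cover PointNav dataset loading, JSON round-tripping, per-scene
+# splitting, shortest-path replay, and procedural episode generation.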
+ +import os +import random +import time + +import numpy as np +import pytest + +import habitat +from habitat.config.default import get_config +from habitat.core.embodied_task import Episode +from habitat.core.logging import logger +from habitat.datasets import make_dataset +from habitat.datasets.pointnav import pointnav_generator as pointnav_generator +from habitat.datasets.pointnav.pointnav_dataset import ( + DEFAULT_SCENE_PATH_PREFIX, + PointNavDatasetV1, +) +from habitat.utils.geometry_utils import ( + angle_between_quaternions, + quaternion_from_coeff, +) + +CFG_TEST = "configs/test/habitat_all_sensors_test.yaml" +CFG_MULTI_TEST = "configs/datasets/pointnav/gibson.yaml" +PARTIAL_LOAD_SCENES = 3 +NUM_EPISODES = 10 + + +def check_json_serializaiton(dataset: habitat.Dataset): + start_time = time.time() + json_str = str(dataset.to_json()) + logger.info( + "JSON conversion finished. {} sec".format((time.time() - start_time)) + ) + decoded_dataset = dataset.__class__() + decoded_dataset.from_json(json_str) + assert len(decoded_dataset.episodes) > 0 + episode = decoded_dataset.episodes[0] + assert isinstance(episode, Episode) + assert ( + decoded_dataset.to_json() == json_str + ), "JSON dataset encoding/decoding isn't consistent" + + +def test_single_pointnav_dataset(): + dataset_config = get_config().DATASET + if not PointNavDatasetV1.check_config_paths_exist(dataset_config): + pytest.skip("Test skipped as dataset files are missing.") + scenes = PointNavDatasetV1.get_scenes_to_load(config=dataset_config) + assert ( + len(scenes) > 0 + ), "Expected dataset contains separate episode file per scene." + dataset = PointNavDatasetV1(config=dataset_config) + assert len(dataset.episodes) > 0, "The dataset shouldn't be empty." + assert ( + len(dataset.scene_ids) == 2 + ), "The test dataset scenes number is wrong." + check_json_serializaiton(dataset) + + +def test_multiple_files_scene_path(): + dataset_config = get_config(CFG_MULTI_TEST).DATASET + if not PointNavDatasetV1.check_config_paths_exist(dataset_config): + pytest.skip("Test skipped as dataset files are missing.") + scenes = PointNavDatasetV1.get_scenes_to_load(config=dataset_config) + assert ( + len(scenes) > 0 + ), "Expected dataset contains separate episode file per scene." + dataset_config.defrost() + dataset_config.CONTENT_SCENES = scenes[:PARTIAL_LOAD_SCENES] + dataset_config.SCENES_DIR = os.path.join( + os.getcwd(), DEFAULT_SCENE_PATH_PREFIX + ) + dataset_config.freeze() + partial_dataset = make_dataset( + id_dataset=dataset_config.TYPE, config=dataset_config + ) + assert ( + len(partial_dataset.scene_ids) == PARTIAL_LOAD_SCENES + ), "Number of loaded scenes doesn't correspond." + print(partial_dataset.episodes[0].scene_id) + assert os.path.exists( + partial_dataset.episodes[0].scene_id + ), "Scene file {} doesn't exist using absolute path".format( + partial_dataset.episodes[0].scene_id + ) + + +def test_multiple_files_pointnav_dataset(): + dataset_config = get_config(CFG_MULTI_TEST).DATASET + if not PointNavDatasetV1.check_config_paths_exist(dataset_config): + pytest.skip("Test skipped as dataset files are missing.") + scenes = PointNavDatasetV1.get_scenes_to_load(config=dataset_config) + assert ( + len(scenes) > 0 + ), "Expected dataset contains separate episode file per scene." 
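+    # Load only the first PARTIAL_LOAD_SCENES scenes and verify the scene
+    # count and JSON round-trip of the partial dataset.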
+    dataset_config.defrost()
+    dataset_config.CONTENT_SCENES = scenes[:PARTIAL_LOAD_SCENES]
+    dataset_config.freeze()
+    partial_dataset = make_dataset(
+        id_dataset=dataset_config.TYPE, config=dataset_config
+    )
+    assert (
+        len(partial_dataset.scene_ids) == PARTIAL_LOAD_SCENES
+    ), "Number of loaded scenes doesn't correspond."
+    check_json_serializaiton(partial_dataset)
+
+
+@pytest.mark.parametrize("split", ["train", "val"])
+def test_dataset_splitting(split):
+    dataset_config = get_config(CFG_MULTI_TEST).DATASET
+    dataset_config.defrost()
+    dataset_config.SPLIT = split
+
+    if not PointNavDatasetV1.check_config_paths_exist(dataset_config):
+        pytest.skip("Test skipped as dataset files are missing.")
+
+    scenes = PointNavDatasetV1.get_scenes_to_load(config=dataset_config)
+    assert (
+        len(scenes) > 0
+    ), "Expected dataset contains separate episode file per scene."
+
+    dataset_config.CONTENT_SCENES = scenes[:PARTIAL_LOAD_SCENES]
+    full_dataset = make_dataset(
+        id_dataset=dataset_config.TYPE, config=dataset_config
+    )
+    full_episodes = {
+        (ep.scene_id, ep.episode_id) for ep in full_dataset.episodes
+    }
+
+    dataset_config.CONTENT_SCENES = scenes[: PARTIAL_LOAD_SCENES // 2]
+    split1_dataset = make_dataset(
+        id_dataset=dataset_config.TYPE, config=dataset_config
+    )
+    split1_episodes = {
+        (ep.scene_id, ep.episode_id) for ep in split1_dataset.episodes
+    }
+
+    dataset_config.CONTENT_SCENES = scenes[
+        PARTIAL_LOAD_SCENES // 2 : PARTIAL_LOAD_SCENES
+    ]
+    split2_dataset = make_dataset(
+        id_dataset=dataset_config.TYPE, config=dataset_config
+    )
+    split2_episodes = {
+        (ep.scene_id, ep.episode_id) for ep in split2_dataset.episodes
+    }
+
+    assert full_episodes == split1_episodes.union(
+        split2_episodes
+    ), "Split dataset is not equal to full dataset"
+    assert (
+        len(split1_episodes.intersection(split2_episodes)) == 0
+    ), "Intersection of split datasets is not the empty set"
+
+
+def check_shortest_path(env, episode):
+    def check_state(agent_state, position, rotation):
+        assert (
+            angle_between_quaternions(
+                agent_state.rotation, quaternion_from_coeff(rotation)
+            )
+            < 1e-5
+        ), "Agent's rotation diverges from the shortest path."
+
+        assert np.allclose(
+            agent_state.position, position
+        ), "Agent's position diverges from the shortest path's one."
+
+    assert len(episode.goals) == 1, "Episode has no goals or more than one."
+    assert (
+        len(episode.shortest_paths) == 1
+    ), "Episode has no shortest paths or more than one."
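+
+    # With exactly one goal and one recorded shortest path, the episode can
+    # be replayed action-by-action below.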
+ + env.episode_iterator = iter([episode]) + env.reset() + start_state = env.sim.get_agent_state() + check_state(start_state, episode.start_position, episode.start_rotation) + + for point in episode.shortest_paths[0]: + cur_state = env.sim.get_agent_state() + check_state(cur_state, point.position, point.rotation) + env.step(point.action) + + +def test_pointnav_episode_generator(): + config = get_config(CFG_TEST) + config.defrost() + config.DATASET.SPLIT = "val" + config.ENVIRONMENT.MAX_EPISODE_STEPS = 500 + config.freeze() + if not PointNavDatasetV1.check_config_paths_exist(config.DATASET): + pytest.skip("Test skipped as dataset files are missing.") + with habitat.Env(config) as env: + env.seed(config.SEED) + random.seed(config.SEED) + generator = pointnav_generator.generate_pointnav_episode( + sim=env.sim, + shortest_path_success_distance=config.TASK.SUCCESS_DISTANCE, + shortest_path_max_steps=config.ENVIRONMENT.MAX_EPISODE_STEPS, + ) + episodes = [] + for _ in range(NUM_EPISODES): + episode = next(generator) + episodes.append(episode) + + for episode in pointnav_generator.generate_pointnav_episode( + sim=env.sim, + num_episodes=NUM_EPISODES, + shortest_path_success_distance=config.TASK.SUCCESS_DISTANCE, + shortest_path_max_steps=config.ENVIRONMENT.MAX_EPISODE_STEPS, + geodesic_to_euclid_min_ratio=0, + ): + episodes.append(episode) + + assert len(episodes) == 2 * NUM_EPISODES + env.episode_iterator = iter(episodes) + + for episode in episodes: + check_shortest_path(env, episode) + + dataset = habitat.Dataset() + dataset.episodes = episodes + assert ( + dataset.to_json() + ), "Generated episodes aren't json serializable." diff --git a/habitat-lab-dialog/test/test_pyrobot.py b/habitat-lab-dialog/test/test_pyrobot.py new file mode 100644 index 0000000..79fb28a --- /dev/null +++ b/habitat-lab-dialog/test/test_pyrobot.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
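+#
+# PyRobot is mocked below when the real package is not installed, so this
+# test can run without robot hardware or the pyrobot dependency.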
+import sys + +import mock +import numpy as np + +from habitat.config.default import get_config +from habitat.sims import make_sim + + +class CameraMock: + def get_rgb(self): + return np.zeros((256, 256, 3)) + + def get_depth(self): + return np.zeros((256, 256, 1)) + + def reset(self): + pass + + def step(self, *args, **kwargs): + pass + + +class RobotMock: + def __init__(self, *args, **kwargs): + self.camera = CameraMock() + self.base = BaseMock() + + +class BaseMock: + def __init__(self, *args, **kwargs): + self.base_state = mock.MagicMock() + self.base_state.bumper = False + + def go_to_relative(self, *args, **kwargs): + pass + + +def test_pyrobot(mocker): + if "pyrobot" not in sys.modules: + # Mock pyrobot package if it is not installed + mock_pyrobot = mocker.MagicMock() + mock_pyrobot.Robot = RobotMock + sys.modules["pyrobot"] = mock_pyrobot + + # Re-register pyrobot with mock + from habitat.sims.registration import _try_register_pyrobot + + _try_register_pyrobot() + + config = get_config() + with make_sim("PyRobot-v0", config=config.PYROBOT) as reality: + + _ = reality.reset() + _ = reality.step( + "go_to_relative", + { + "xyt_position": [0, 0, (10 / 180) * np.pi], + "use_map": False, + "close_loop": True, + "smooth": False, + }, + ) diff --git a/habitat-lab-dialog/test/test_r2r_vln.py b/habitat-lab-dialog/test/test_r2r_vln.py new file mode 100644 index 0000000..63fe864 --- /dev/null +++ b/habitat-lab-dialog/test/test_r2r_vln.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import time + +import pytest + +import habitat +from habitat.config.default import get_config +from habitat.core.logging import logger +from habitat.datasets import make_dataset +from habitat.datasets.vln import r2r_vln_dataset as r2r_vln_dataset +from habitat.tasks.nav.shortest_path_follower import ShortestPathFollower +from habitat.tasks.vln.vln import VLNEpisode + +CFG_TEST = "configs/test/habitat_r2r_vln_test.yaml" +R2R_VAL_SEEN_EPISODES = 778 +EPISODES_LIMIT = 1 + + +def check_json_serializaiton(dataset: habitat.Dataset): + start_time = time.time() + json_str = str(dataset.to_json()) + logger.info( + "JSON conversion finished. 
{} sec".format((time.time() - start_time)) + ) + decoded_dataset = dataset.__class__() + decoded_dataset.from_json(json_str) + assert len(decoded_dataset.episodes) > 0 + episode = decoded_dataset.episodes[0] + assert isinstance(episode, VLNEpisode) + assert ( + decoded_dataset.to_json() == json_str + ), "JSON dataset encoding/decoding isn't consistent" + + +def test_r2r_vln_dataset(): + vln_config = get_config(CFG_TEST) + if not r2r_vln_dataset.VLNDatasetV1.check_config_paths_exist( + vln_config.DATASET + ): + pytest.skip("Please download Matterport3D R2R dataset to data folder.") + + dataset = make_dataset( + id_dataset=vln_config.DATASET.TYPE, config=vln_config.DATASET + ) + assert dataset + assert ( + len(dataset.episodes) == R2R_VAL_SEEN_EPISODES + ), "Val Seen split episode number mismatch" + + check_json_serializaiton(dataset) + + +@pytest.mark.parametrize("split", ["train", "val_seen", "val_unseen"]) +def test_dataset_splitting(split): + dataset_config = get_config(CFG_TEST).DATASET + dataset_config.defrost() + dataset_config.SPLIT = split + + if not r2r_vln_dataset.VLNDatasetV1.check_config_paths_exist( + dataset_config + ): + pytest.skip("Please download Matterport3D R2R dataset to data folder.") + + scenes = r2r_vln_dataset.VLNDatasetV1.get_scenes_to_load( + config=dataset_config + ) + assert ( + len(scenes) > 0 + ), "Expected dataset contains separate episode file per scene." + + dataset_config.CONTENT_SCENES = scenes + full_dataset = make_dataset( + id_dataset=dataset_config.TYPE, config=dataset_config + ) + full_episodes = { + (ep.scene_id, ep.episode_id) for ep in full_dataset.episodes + } + + dataset_config.CONTENT_SCENES = scenes[0 : len(scenes) // 2] + split1_dataset = make_dataset( + id_dataset=dataset_config.TYPE, config=dataset_config + ) + split1_episodes = { + (ep.scene_id, ep.episode_id) for ep in split1_dataset.episodes + } + + dataset_config.CONTENT_SCENES = scenes[len(scenes) // 2 :] + split2_dataset = make_dataset( + id_dataset=dataset_config.TYPE, config=dataset_config + ) + split2_episodes = { + (ep.scene_id, ep.episode_id) for ep in split2_dataset.episodes + } + + assert full_episodes == split1_episodes.union( + split2_episodes + ), "Split dataset is not equal to full dataset" + assert ( + len(split1_episodes.intersection(split2_episodes)) == 0 + ), "Intersection of split datasets is not the empty set" + + +def test_r2r_vln_sim(): + vln_config = get_config(CFG_TEST) + + if not r2r_vln_dataset.VLNDatasetV1.check_config_paths_exist( + vln_config.DATASET + ): + pytest.skip( + "Please download Matterport3D R2R VLN dataset to data folder." + ) + + dataset = make_dataset( + id_dataset=vln_config.DATASET.TYPE, config=vln_config.DATASET + ) + + with habitat.Env(config=vln_config, dataset=dataset) as env: + env.episodes = dataset.episodes[:EPISODES_LIMIT] + + follower = ShortestPathFollower( + env.sim, goal_radius=0.5, return_one_hot=False + ) + + for _ in range(len(env.episodes)): + env.reset() + path = env.current_episode.reference_path + [ + env.current_episode.goals[0].position + ] + for point in path: + while env.episode_over: + best_action = follower.get_next_action(point) + + obs = env.step(best_action) + assert "rgb" in obs, "RGB image is missing in observation." + assert ( + "instruction" in obs + ), "Instruction is missing in observation." 
+                    assert (
+                        obs["instruction"]["text"]
+                        == env.current_episode.instruction.instruction_text
+                    ), "Instruction from sensor does not match the instruction from the episode"
+
+                    assert obs["rgb"].shape[:2] == (
+                        vln_config.SIMULATOR.RGB_SENSOR.HEIGHT,
+                        vln_config.SIMULATOR.RGB_SENSOR.WIDTH,
+                    ), "Observation resolution {} doesn't correspond to config " "({}, {}).".format(
+                        obs["rgb"].shape[:2],
+                        vln_config.SIMULATOR.RGB_SENSOR.HEIGHT,
+                        vln_config.SIMULATOR.RGB_SENSOR.WIDTH,
+                    )
diff --git a/habitat-lab-dialog/test/test_rnn_state_encoder.py b/habitat-lab-dialog/test/test_rnn_state_encoder.py
new file mode 100644
index 0000000..8685c61
--- /dev/null
+++ b/habitat-lab-dialog/test/test_rnn_state_encoder.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pytest
+
+try:
+    import torch
+except ImportError:
+    torch = None
+
+
+@pytest.mark.skipif(torch is None, reason="Test requires pytorch")
+def test_rnn_state_encoder():
+    from habitat_baselines.rl.models.rnn_state_encoder import (
+        build_rnn_state_encoder,
+    )
+
+    device = (
+        torch.device("cuda")
+        if torch.cuda.is_available()
+        else torch.device("cpu")
+    )
+    rnn_state_encoder = build_rnn_state_encoder(32, 32, num_layers=2).to(
+        device=device
+    )
+    rnn = rnn_state_encoder.rnn
+    with torch.no_grad():
+        for T in [1, 2, 4, 8, 16, 32, 64, 3, 13, 31]:
+            for N in [1, 2, 4, 8, 3, 5]:
+                masks = torch.randint(
+                    0, 2, size=(T, N, 1), dtype=torch.bool, device=device
+                )
+                inputs = torch.randn((T, N, 32), device=device)
+                hidden_states = torch.randn(
+                    rnn_state_encoder.num_recurrent_layers,
+                    N,
+                    32,
+                    device=device,
+                )
+
+                outputs, out_hiddens = rnn_state_encoder(
+                    inputs.flatten(0, 1),
+                    hidden_states.permute(1, 0, 2),
+                    masks.flatten(0, 1),
+                )
+                out_hiddens = out_hiddens.permute(1, 0, 2)
+
+                reference_outputs = []
+                reference_hiddens = hidden_states.clone()
+                for t in range(T):
+                    reference_hiddens = torch.where(
+                        masks[t].view(1, -1, 1),
+                        reference_hiddens,
+                        reference_hiddens.new_zeros(()),
+                    )
+
+                    x, reference_hiddens = rnn(
+                        inputs[t : t + 1], reference_hiddens
+                    )
+
+                    reference_outputs.append(x.squeeze(0))
+
+                reference_outputs = torch.stack(reference_outputs, 0).flatten(
+                    0, 1
+                )
+
+                assert (
+                    torch.norm(reference_outputs - outputs).item() < 1e-3
+                ), "Failed on (T={}, N={})".format(T, N)
+                assert (
+                    torch.norm(reference_hiddens - out_hiddens).item() < 1e-3
+                ), "Failed on (T={}, N={})".format(T, N)
diff --git a/habitat-lab-dialog/test/test_sensors.py b/habitat-lab-dialog/test/test_sensors.py
new file mode 100644
index 0000000..801cfac
--- /dev/null
+++ b/habitat-lab-dialog/test/test_sensors.py
@@ -0,0 +1,496 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
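+#
+# Covers the state sensors (heading/compass/GPS), pointgoal and imagegoal
+# sensors, the proximity sensor, collision metrics, observation replay via
+# get_observations_at, and the RGB-D/actuation noise models.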
+ +import os +import random + +import numpy as np +import pytest +import quaternion + +import habitat +from habitat.config.default import get_config +from habitat.tasks.nav.nav import ( + MoveForwardAction, + NavigationEpisode, + NavigationGoal, +) +from habitat.utils.geometry_utils import ( + angle_between_quaternions, + quaternion_rotate_vector, +) +from habitat.utils.test_utils import sample_non_stop_action +from habitat.utils.visualizations.utils import ( + images_to_video, + observations_to_image, +) + + +def _random_episode(env, config): + random_location = env._sim.sample_navigable_point() + random_heading = np.random.uniform(-np.pi, np.pi) + random_rotation = [ + 0, + np.sin(random_heading / 2), + 0, + np.cos(random_heading / 2), + ] + env.episode_iterator = iter( + [ + NavigationEpisode( + episode_id="0", + scene_id=config.SIMULATOR.SCENE, + start_position=random_location, + start_rotation=random_rotation, + goals=[], + ) + ] + ) + + +def test_state_sensors(): + config = get_config() + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + config.defrost() + config.TASK.SENSORS = ["HEADING_SENSOR", "COMPASS_SENSOR", "GPS_SENSOR"] + config.freeze() + with habitat.Env(config=config, dataset=None) as env: + env.reset() + random.seed(123) + np.random.seed(123) + + for _ in range(100): + random_heading = np.random.uniform(-np.pi, np.pi) + random_rotation = [ + 0, + np.sin(random_heading / 2), + 0, + np.cos(random_heading / 2), + ] + env.episode_iterator = iter( + [ + NavigationEpisode( + episode_id="0", + scene_id=config.SIMULATOR.SCENE, + start_position=[03.00611, 0.072_447, -2.67867], + start_rotation=random_rotation, + goals=[], + ) + ] + ) + + obs = env.reset() + heading = obs["heading"] + assert np.allclose(heading, [random_heading]) + assert np.allclose(obs["compass"], [0.0], atol=1e-5) + assert np.allclose(obs["gps"], [0.0, 0.0], atol=1e-5) + + +def test_tactile(): + config = get_config() + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + config.defrost() + config.TASK.SENSORS = ["PROXIMITY_SENSOR"] + config.freeze() + with habitat.Env(config=config, dataset=None) as env: + env.reset() + random.seed(1234) + + for _ in range(20): + _random_episode(env, config) + env.reset() + + for _ in range(10): + obs = env.step(action=MoveForwardAction.name) + proximity = obs["proximity"] + assert 0.0 <= proximity + assert 2.0 >= proximity + + +def test_collisions(): + config = get_config() + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + config.defrost() + config.TASK.MEASUREMENTS = ["COLLISIONS"] + config.freeze() + with habitat.Env(config=config, dataset=None) as env: + env.reset() + + for _ in range(20): + _random_episode(env, config) + + env.reset() + assert env.get_metrics()["collisions"] is None + + prev_collisions = 0 + prev_loc = env.sim.get_agent_state().position + for _ in range(50): + action = sample_non_stop_action(env.action_space) + env.step(action) + collisions = env.get_metrics()["collisions"]["count"] + loc = env.sim.get_agent_state().position + if ( + np.linalg.norm(loc - prev_loc) + < 0.9 * config.SIMULATOR.FORWARD_STEP_SIZE + and action["action"] == MoveForwardAction.name + ): + # Check to see if the new method of doing collisions catches + # all the same collisions as the old method + assert collisions == prev_collisions + 1 + + # We can _never_ collide with standard turn 
actions + if action["action"] != MoveForwardAction.name: + assert collisions == prev_collisions + + prev_loc = loc + prev_collisions = collisions + + +def test_pointgoal_sensor(): + config = get_config() + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + config.defrost() + config.TASK.SENSORS = ["POINTGOAL_SENSOR"] + config.TASK.POINTGOAL_SENSOR.DIMENSIONALITY = 3 + config.TASK.POINTGOAL_SENSOR.GOAL_FORMAT = "CARTESIAN" + config.freeze() + with habitat.Env(config=config, dataset=None) as env: + + # start position is checked for validity for the specific test scene + valid_start_position = [-1.3731, 0.08431, 8.60692] + expected_pointgoal = [0.1, 0.2, 0.3] + goal_position = np.add(valid_start_position, expected_pointgoal) + + # starting quaternion is rotated 180 degree along z-axis, which + # corresponds to simulator using z-negative as forward action + start_rotation = [0, 0, 0, 1] + + env.episode_iterator = iter( + [ + NavigationEpisode( + episode_id="0", + scene_id=config.SIMULATOR.SCENE, + start_position=valid_start_position, + start_rotation=start_rotation, + goals=[NavigationGoal(position=goal_position)], + ) + ] + ) + + env.reset() + for _ in range(100): + obs = env.step(sample_non_stop_action(env.action_space)) + pointgoal = obs["pointgoal"] + # check to see if taking non-stop actions will affect static point_goal + assert np.allclose(pointgoal, expected_pointgoal) + + +def test_pointgoal_with_gps_compass_sensor(): + config = get_config() + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + config.defrost() + config.TASK.SENSORS = [ + "POINTGOAL_WITH_GPS_COMPASS_SENSOR", + "COMPASS_SENSOR", + "GPS_SENSOR", + "POINTGOAL_SENSOR", + ] + config.TASK.POINTGOAL_WITH_GPS_COMPASS_SENSOR.DIMENSIONALITY = 3 + config.TASK.POINTGOAL_WITH_GPS_COMPASS_SENSOR.GOAL_FORMAT = "CARTESIAN" + + config.TASK.POINTGOAL_SENSOR.DIMENSIONALITY = 3 + config.TASK.POINTGOAL_SENSOR.GOAL_FORMAT = "CARTESIAN" + + config.TASK.GPS_SENSOR.DIMENSIONALITY = 3 + + config.freeze() + with habitat.Env(config=config, dataset=None) as env: + # start position is checked for validity for the specific test scene + valid_start_position = [-1.3731, 0.08431, 8.60692] + expected_pointgoal = [0.1, 0.2, 0.3] + goal_position = np.add(valid_start_position, expected_pointgoal) + + # starting quaternion is rotated 180 degree along z-axis, which + # corresponds to simulator using z-negative as forward action + start_rotation = [0, 0, 0, 1] + + env.episode_iterator = iter( + [ + NavigationEpisode( + episode_id="0", + scene_id=config.SIMULATOR.SCENE, + start_position=valid_start_position, + start_rotation=start_rotation, + goals=[NavigationGoal(position=goal_position)], + ) + ] + ) + + env.reset() + for _ in range(100): + obs = env.step(sample_non_stop_action(env.action_space)) + pointgoal = obs["pointgoal"] + pointgoal_with_gps_compass = obs["pointgoal_with_gps_compass"] + compass = float(obs["compass"][0]) + gps = obs["gps"] + # check to see if taking non-stop actions will affect static point_goal + assert np.allclose( + pointgoal_with_gps_compass, + quaternion_rotate_vector( + quaternion.from_rotation_vector( + compass * np.array([0, 1, 0]) + ).inverse(), + pointgoal - gps, + ), + atol=1e-5, + ) + + +def test_imagegoal_sensor(): + config = get_config() + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + config.defrost() + config.TASK.SENSORS = 
["IMAGEGOAL_SENSOR"] + config.SIMULATOR.AGENT_0.SENSORS = ["RGB_SENSOR"] + config.freeze() + with habitat.Env(config=config, dataset=None) as env: + + # start position is checked for validity for the specific test scene + valid_start_position = [-1.3731, 0.08431, 8.60692] + pointgoal = [0.1, 0.2, 0.3] + goal_position = np.add(valid_start_position, pointgoal) + + pointgoal_2 = [0.3, 0.2, 0.1] + goal_position_2 = np.add(valid_start_position, pointgoal_2) + + # starting quaternion is rotated 180 degree along z-axis, which + # corresponds to simulator using z-negative as forward action + start_rotation = [0, 0, 0, 1] + + env.episode_iterator = iter( + [ + NavigationEpisode( + episode_id="0", + scene_id=config.SIMULATOR.SCENE, + start_position=valid_start_position, + start_rotation=start_rotation, + goals=[NavigationGoal(position=goal_position)], + ), + NavigationEpisode( + episode_id="1", + scene_id=config.SIMULATOR.SCENE, + start_position=valid_start_position, + start_rotation=start_rotation, + goals=[NavigationGoal(position=goal_position_2)], + ), + ] + ) + obs = env.reset() + for _ in range(100): + new_obs = env.step(sample_non_stop_action(env.action_space)) + # check to see if taking non-stop actions will affect static image_goal + assert np.allclose(obs["imagegoal"], new_obs["imagegoal"]) + assert np.allclose(obs["rgb"].shape, new_obs["imagegoal"].shape) + + previous_episode_obs = obs + _ = env.reset() + for _ in range(10): + new_obs = env.step(sample_non_stop_action(env.action_space)) + # check to see if taking non-stop actions will affect static image_goal + assert not np.allclose( + previous_episode_obs["imagegoal"], new_obs["imagegoal"] + ) + assert np.allclose( + previous_episode_obs["rgb"].shape, new_obs["imagegoal"].shape + ) + + +def test_get_observations_at(): + config = get_config() + if not os.path.exists(config.SIMULATOR.SCENE): + pytest.skip("Please download Habitat test data to data folder.") + config.defrost() + config.TASK.SENSORS = [] + config.SIMULATOR.AGENT_0.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR"] + config.freeze() + with habitat.Env(config=config, dataset=None) as env: + + # start position is checked for validity for the specific test scene + valid_start_position = [-1.3731, 0.08431, 8.60692] + expected_pointgoal = [0.1, 0.2, 0.3] + goal_position = np.add(valid_start_position, expected_pointgoal) + + # starting quaternion is rotated 180 degree along z-axis, which + # corresponds to simulator using z-negative as forward action + start_rotation = [0, 0, 0, 1] + + env.episode_iterator = iter( + [ + NavigationEpisode( + episode_id="0", + scene_id=config.SIMULATOR.SCENE, + start_position=valid_start_position, + start_rotation=start_rotation, + goals=[NavigationGoal(position=goal_position)], + ) + ] + ) + + obs = env.reset() + start_state = env.sim.get_agent_state() + for _ in range(100): + # Note, this test will not currently work for camera change actions + # (look up/down), only for movement actions. 
+            new_obs = env.step(sample_non_stop_action(env.action_space))
+            for key, val in new_obs.items():
+                agent_state = env.sim.get_agent_state()
+                if not (
+                    np.allclose(agent_state.position, start_state.position)
+                    and np.allclose(agent_state.rotation, start_state.rotation)
+                ):
+                    assert not np.allclose(val, obs[key])
+            obs_at_point = env.sim.get_observations_at(
+                start_state.position,
+                start_state.rotation,
+                keep_agent_at_new_pose=False,
+            )
+            for key, val in obs_at_point.items():
+                assert np.allclose(val, obs[key])
+
+        obs_at_point = env.sim.get_observations_at(
+            start_state.position,
+            start_state.rotation,
+            keep_agent_at_new_pose=True,
+        )
+        for key, val in obs_at_point.items():
+            assert np.allclose(val, obs[key])
+        agent_state = env.sim.get_agent_state()
+        assert np.allclose(agent_state.position, start_state.position)
+        assert np.allclose(agent_state.rotation, start_state.rotation)
+
+
+def test_noise_models_rgbd():
+    DEMO_MODE = False
+    N_STEPS = 100
+
+    config = get_config()
+    config.defrost()
+    config.SIMULATOR.SCENE = (
+        "data/scene_datasets/habitat-test-scenes/skokloster-castle.glb"
+    )
+    config.SIMULATOR.AGENT_0.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR"]
+    config.freeze()
+    if not os.path.exists(config.SIMULATOR.SCENE):
+        pytest.skip("Please download Habitat test data to data folder.")
+
+    valid_start_position = [-1.3731, 0.08431, 8.60692]
+
+    expected_pointgoal = [0.1, 0.2, 0.3]
+    goal_position = np.add(valid_start_position, expected_pointgoal)
+
+    # starting quaternion is rotated 180 degree along z-axis, which
+    # corresponds to simulator using z-negative as forward action
+    start_rotation = [0, 0, 0, 1]
+    test_episode = NavigationEpisode(
+        episode_id="0",
+        scene_id=config.SIMULATOR.SCENE,
+        start_position=valid_start_position,
+        start_rotation=start_rotation,
+        goals=[NavigationGoal(position=goal_position)],
+    )
+
+    print(f"{test_episode}")
+    with habitat.Env(config=config, dataset=None) as env:
+
+        env.episode_iterator = iter([test_episode])
+        no_noise_obs = [env.reset()]
+        no_noise_states = [env.sim.get_agent_state()]
+
+        actions = [
+            sample_non_stop_action(env.action_space) for _ in range(N_STEPS)
+        ]
+        for action in actions:
+            no_noise_obs.append(env.step(action))
+            no_noise_states.append(env.sim.get_agent_state())
+        env.close()
+
+        config.defrost()
+
+        config.SIMULATOR.RGB_SENSOR.NOISE_MODEL = "GaussianNoiseModel"
+        config.SIMULATOR.RGB_SENSOR.NOISE_MODEL_KWARGS = habitat.Config()
+        config.SIMULATOR.RGB_SENSOR.NOISE_MODEL_KWARGS.INTENSITY_CONSTANT = 0.5
+        config.SIMULATOR.DEPTH_SENSOR.NOISE_MODEL = "RedwoodDepthNoiseModel"
+
+        config.SIMULATOR.ACTION_SPACE_CONFIG = "pyrobotnoisy"
+        config.SIMULATOR.NOISE_MODEL = habitat.Config()
+        config.SIMULATOR.NOISE_MODEL.ROBOT = "LoCoBot"
+        config.SIMULATOR.NOISE_MODEL.CONTROLLER = "Proportional"
+        config.SIMULATOR.NOISE_MODEL.NOISE_MULTIPLIER = 0.5
+
+        config.freeze()
+
+        env = habitat.Env(config=config, dataset=None)
+
+        env.episode_iterator = iter([test_episode])
+
+        obs = env.reset()
+        assert np.linalg.norm(
+            obs["rgb"].astype(np.float64)
+            - no_noise_obs[0]["rgb"].astype(np.float64)
+        ) > 1.5e-2 * np.linalg.norm(
+            no_noise_obs[0]["rgb"].astype(np.float64)
+        ), "No RGB noise detected."
+
+        assert np.linalg.norm(
+            obs["depth"].astype(np.float64)
+            - no_noise_obs[0]["depth"].astype(np.float64)
+        ) > 1.5e-2 * np.linalg.norm(
+            no_noise_obs[0]["depth"].astype(np.float64)
+        ), "No Depth noise detected."
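+
+        # Replay the identical action sequence under the noisy actuation
+        # config and check that turn angles and forward steps deviate from
+        # the noise-free constants.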
+ + images = [] + state = env.sim.get_agent_state() + angle_diffs = [] + pos_diffs = [] + for action in actions: + prev_state = state + obs = env.step(action) + state = env.sim.get_agent_state() + position_change = np.linalg.norm( + np.array(state.position) - np.array(prev_state.position), ord=2 + ) + + if action["action"][:5] == "TURN_": + angle_diff = abs( + angle_between_quaternions( + state.rotation, prev_state.rotation + ) + - np.deg2rad(config.SIMULATOR.TURN_ANGLE) + ) + angle_diffs.append(angle_diff) + else: + pos_diffs.append( + abs(position_change - config.SIMULATOR.FORWARD_STEP_SIZE) + ) + + if DEMO_MODE: + images.append(observations_to_image(obs, {})) + + if DEMO_MODE: + images_to_video(images, "data/video/test_noise", "test_noise") + + assert ( + np.mean(angle_diffs) > 0.025 + ), "No turn action actuation noise detected." + assert ( + np.mean(pos_diffs) > 0.025 + ), "No forward action actuation noise detected." diff --git a/habitat-lab-dialog/test/test_spaces.py b/habitat-lab-dialog/test/test_spaces.py new file mode 100644 index 0000000..c110461 --- /dev/null +++ b/habitat-lab-dialog/test/test_spaces.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import gym + +from habitat.core.spaces import ActionSpace, EmptySpace, ListSpace + + +def test_empty_space(): + space = EmptySpace() + assert space.contains(space.sample()) + assert space.contains(None) + assert not space.contains(0) + + +def test_action_space(): + space = ActionSpace( + { + "move": gym.spaces.Dict( + { + "position": gym.spaces.Discrete(2), + "velocity": gym.spaces.Discrete(3), + } + ), + "move_forward": EmptySpace(), + } + ) + assert space.contains(space.sample()) + assert space.contains( + {"action": "move", "action_args": {"position": 0, "velocity": 1}} + ) + assert space.contains({"action": "move_forward"}) + assert not space.contains([0, 1, 2]) + assert not space.contains({"zero": None}) + assert not space.contains({"action": "bad"}) + assert not space.contains({"action": "move"}) + assert not space.contains( + {"action": "move", "action_args": {"position": 0}} + ) + assert not space.contains( + {"action": "move_forward", "action_args": {"position": 0}} + ) + + +def test_list_space(): + space = ListSpace(gym.spaces.Discrete(2), 5, 10) + assert space.contains(space.sample()) + assert not space.contains(0) + assert not space.contains([0] * 4) + assert not space.contains([2] * 5) + assert not space.contains([1] * 11) diff --git a/habitat-lab-dialog/test/test_tensor_dict.py b/habitat-lab-dialog/test/test_tensor_dict.py new file mode 100644 index 0000000..96e8148 --- /dev/null +++ b/habitat-lab-dialog/test/test_tensor_dict.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
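+#
+# TensorDict mirrors a nested dict of tensors; the tests below cover tree
+# construction, string and index assignment, slicing, and map operations.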
+ +import numpy as np +import pytest + +try: + import torch +except ImportError: + torch = None + +try: + from habitat_baselines.common.tensor_dict import TensorDict +except ImportError: + pass + + +@pytest.mark.skipif(torch is None, reason="Test requires pytorch") +def test_tensor_dict_constructor(): + dict_tree = dict( + a=torch.randn(2, 2), b=dict(c=dict(d=np.random.randn(3, 3))) + ) + tensor_dict = TensorDict.from_tree(dict_tree) + + assert torch.is_tensor(tensor_dict["a"]) + assert isinstance(tensor_dict["b"], TensorDict) + assert isinstance(tensor_dict["b"]["c"], TensorDict) + assert torch.is_tensor(tensor_dict["b"]["c"]["d"]) + + +@pytest.mark.skipif(torch is None, reason="Test requires pytorch") +def test_tensor_dict_to_tree(): + dict_tree = dict(a=torch.randn(2, 2), b=dict(c=dict(d=torch.randn(3, 3)))) + + assert dict_tree == TensorDict.from_tree(dict_tree).to_tree() + + +@pytest.mark.skipif(torch is None, reason="Test requires pytorch") +def test_tensor_dict_str_index(): + dict_tree = dict(a=torch.randn(2, 2), b=dict(c=dict(d=torch.randn(3, 3)))) + tensor_dict = TensorDict.from_tree(dict_tree) + + x = torch.randn(5, 5) + tensor_dict["a"] = x + assert (tensor_dict["a"] == x).all() + + with pytest.raises(KeyError): + _ = tensor_dict["c"] + + +@pytest.mark.skipif(torch is None, reason="Test requires pytorch") +def test_tensor_dict_index(): + dict_tree = dict(a=torch.randn(2, 2), b=dict(c=dict(d=torch.randn(3, 3)))) + tensor_dict = TensorDict.from_tree(dict_tree) + + with pytest.raises(KeyError): + tensor_dict["b"][0] = dict(q=torch.randn(3)) + + tmp = dict(c=dict(d=torch.randn(3))) + tensor_dict["b"][0] = tmp + assert torch.allclose(tensor_dict["b"]["c"]["d"][0], tmp["c"]["d"]) + assert not torch.allclose(tensor_dict["b"]["c"]["d"][1], tmp["c"]["d"]) + + tensor_dict["b"]["c"]["x"] = torch.randn(5, 5) + with pytest.raises(KeyError): + tensor_dict["b"][1] = tmp + + tensor_dict["b"].set(1, tmp, strict=False) + assert torch.allclose(tensor_dict["b"]["c"]["d"][1], tmp["c"]["d"]) + + tmp = dict(c=dict(d=torch.randn(1, 3))) + del tensor_dict["b"]["c"]["x"] + tensor_dict["b"][2:3] = tmp + assert torch.allclose(tensor_dict["b"]["c"]["d"][2:3], tmp["c"]["d"]) + + +@pytest.mark.skipif(torch is None, reason="Test requires pytorch") +def test_tensor_dict_map(): + dict_tree = dict(a=dict(b=[0])) + tensor_dict = TensorDict.from_tree(dict_tree) + + res = tensor_dict.map(lambda x: x + 1) + assert (res["a"]["b"] == 1).all() + + tensor_dict.map_in_place(lambda x: x + 1) + + assert res == tensor_dict diff --git a/habitat-lab-dialog/test/test_visual_utils.py b/habitat-lab-dialog/test/test_visual_utils.py new file mode 100644 index 0000000..0400717 --- /dev/null +++ b/habitat-lab-dialog/test/test_visual_utils.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
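+#
+# observations_to_image tiles RGB, depth, and the resized top-down map side
+# by side, which is why the expected frame below is 1000 pixels wide
+# (400 + 400 + 200).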
+ +import numpy as np + +from habitat.utils.visualizations.utils import observations_to_image + + +def test_observations_to_image(): + observations = { + "rgb": np.random.rand(200, 400, 3), + "depth": np.random.rand(200, 400, 1), + } + info = { + "collisions": {"is_collision": True}, + "top_down_map": { + "map": np.random.randint(low=0, high=255, size=(300, 300)), + "fog_of_war_mask": np.random.randint( + low=0, high=1, size=(300, 300) + ), + "agent_map_coord": (10, 10), + "agent_angle": np.random.random(), + }, + } + image = observations_to_image(observations, info) + assert image.shape == ( + 200, + 1000, + 3, + ), "Resulted image resolution doesn't match." diff --git a/images/task_avlen.png b/images/task_avlen.png new file mode 100644 index 0000000..d72a428 Binary files /dev/null and b/images/task_avlen.png differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..76e4ce4 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,33 @@ +# Copyright (C) 2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +# Requirements from habitat-lab-dialog +gym>=0.17.3 +numpy>=1.16.1 +yacs>=0.1.5 +numpy-quaternion>=2019.3.18.14.33.20 +attrs>=19.1.0 +opencv-python>=3.3.0 +pickle5 +# visualization optional dependencies +imageio>=2.2.0 +imageio-ffmpeg>=0.2.0 +# scipy>=1.0.0 +tqdm>=4.0.0 +numba>=0.44.0 + +# Additional requirements from habitat-sim +pillow +scipy>=1.3.0 +matplotlib +gitpython + +# Additional requirements from CLIP +ftfy +regex +torch>=1.7.1 +torchvision + +# Additional requirements from Matterport3D Simulator +pybind11 diff --git a/res/Material Map.txt b/res/Material Map.txt new file mode 100644 index 0000000..c539488 --- /dev/null +++ b/res/Material Map.txt @@ -0,0 +1,262 @@ + +namespace materials +{ + const AudioMaterial default = + { + { 0.1f }, // Absorption + { 0.5f }, // Scattering + { 0.0f } // Transmission + }; + const AudioMaterial acousticTile = + { + { {125.0f,0.5f}, {250.0f,0.7f}, {500.0f,0.6f}, {1000.0f,0.7f}, {2000.0f,0.7f}, {4000.0f,0.5f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.15f}, {500.0f,0.2f}, {1000.0f,0.2f}, {2000.0f,0.25f}, {4000.0f,0.3f} }, // Scattering + { {125.0f,0.05f}, {250.0f,0.04f}, {500.0f,0.03f}, {1000.0f,0.02f}, {2000.0f,0.005f}, {4000.0f,0.002f} } // Transmission + }; + const AudioMaterial brick = + { + { {125.0f,0.02f}, {250.0f,0.02f}, {500.0f,0.03f}, {1000.0f,0.04f}, {2000.0f,0.05f}, {4000.0f,0.07f} }, // Absorption + { {125.0f,0.2f}, {250.0f,0.25f}, {500.0f,0.30f}, {1000.0f,0.35f}, {2000.0f,0.4f}, {4000.0f,0.45f} }, // Scattering + { {125.0f,0.025f}, {250.0f,0.019f}, {500.0f,0.01f}, {1000.0f,0.0045f}, {2000.0f,0.0018f}, {4000.0f,0.0089f} } // Transmission + }; + const AudioMaterial brickPainted = + { + { {125.0f,0.01f}, {250.0f,0.01f}, {500.0f,0.02f}, {1000.0f,0.02f}, {2000.0f,0.02f}, {4000.0f,0.03f} }, // Absorption + { {125.0f,0.15f}, {250.0f,0.15f}, {500.0f,0.2f}, {1000.0f,0.2f}, {2000.0f,0.2f}, {4000.0f,0.25f} }, // Scattering + { {125.0f,0.025f}, {250.0f,0.019f}, {500.0f,0.01f}, {1000.0f,0.0045f}, {2000.0f,0.0018f}, {4000.0f,0.0089f} } // Transmission + }; + const AudioMaterial carpet = + { + { {125.0f,0.01f}, {250.0f,0.05f}, {500.0f,0.1f}, {1000.0f,0.2f}, {2000.0f,0.45f}, {4000.0f,0.65f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.1f}, {500.0f,0.15f}, {1000.0f,0.2f}, {2000.0f,0.3f}, {4000.0f,0.45f} }, // Scattering + { {125.0f,0.004f}, {250.0f,0.0079f}, {500.0f,0.0056f}, {1000.0f,0.0016f}, {2000.0f,0.0014f}, {4000.0f,0.0005f} } // Transmission + }; + const 
AudioMaterial carpetHeavy = + { + { {125.0f,0.02f}, {250.0f,0.06f}, {500.0f,0.14f}, {1000.0f,0.37f}, {2000.0f,0.48f}, {4000.0f,0.63f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.15f}, {500.0f,0.20f}, {1000.0f,0.25f}, {2000.0f,0.35f}, {4000.0f,0.5f} }, // Scattering + { {125.0f,0.004f}, {250.0f,0.0079f}, {500.0f,0.0056f}, {1000.0f,0.0016f}, {2000.0f,0.0014f}, {4000.0f,0.0005f} } // Transmission + }; + const AudioMaterial carpetHeavyPadded = + { + { {125.0f,0.08f}, {250.0f,0.24f}, {500.0f,0.57f}, {1000.0f,0.69f}, {2000.0f,0.71f}, {4000.0f,0.73f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.15f}, {500.0f,0.20f}, {1000.0f,0.25f}, {2000.0f,0.35f}, {4000.0f,0.5f} }, // Scattering + { {125.0f,0.004f}, {250.0f,0.0079f}, {500.0f,0.0056f}, {1000.0f,0.0016f}, {2000.0f,0.0014f}, {4000.0f,0.0005f} } // Transmission + }; + const AudioMaterial ceramicTile = + { + { {125.0f,0.01f}, {250.0f,0.01f}, {500.0f,0.01f}, {1000.0f,0.01f}, {2000.0f,0.02f}, {4000.0f,0.02f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.12f}, {500.0f,0.14f}, {1000.0f,0.16f}, {2000.0f,0.18f}, {4000.0f,0.2f} }, // Scattering + { {125.0f,0.004f}, {250.0f,0.0079f}, {500.0f,0.0056f}, {1000.0f,0.0016f}, {2000.0f,0.0014f}, {4000.0f,0.0005f} } // Transmission + }; + const AudioMaterial concrete = + { + { {125.0f,0.01f}, {250.0f,0.01f}, {500.0f,0.01f}, {1000.0f,0.02f}, {2000.0f,0.02f}, {4000.0f,0.02f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.11f}, {500.0f,0.12f}, {1000.0f,0.13f}, {2000.0f,0.14f}, {4000.0f,0.15f} }, // Scattering + { {125.0f,0.004f}, {250.0f,0.0079f}, {500.0f,0.0056f}, {1000.0f,0.0016f}, {2000.0f,0.0014f}, {4000.0f,0.0005f} } // Transmission + }; + const AudioMaterial concreteRough = + { + { {125.0f,0.01f}, {250.0f,0.02f}, {500.0f,0.04f}, {1000.0f,0.06f}, {2000.0f,0.08f}, {4000.0f,0.1f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.12f}, {500.0f,0.15f}, {1000.0f,0.2f}, {2000.0f,0.25f}, {4000.0f,0.30f} }, // Scattering + { {125.0f,0.004f}, {250.0f,0.0079f}, {500.0f,0.0056f}, {1000.0f,0.0016f}, {2000.0f,0.0014f}, {4000.0f,0.0005f} } // Transmission + }; + const AudioMaterial concreteBlock = + { + { {125.0f,0.36f}, {250.0f,0.44f}, {500.0f,0.31f}, {1000.0f,0.29f}, {2000.0f,0.39f}, {4000.0f,0.21f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.12f}, {500.0f,0.15f}, {1000.0f,0.2f}, {2000.0f,0.3f}, {4000.0f,0.4f} }, // Scattering + { {125.0f,0.02f}, {250.0f,0.01f}, {500.0f,0.0063f}, {1000.0f,0.0035f}, {2000.0f,0.0011f}, {4000.0f,0.0006f} } // Transmission + }; + const AudioMaterial concreteBlockPainted = + { + { {125.0f,0.1f}, {250.0f,0.05f}, {500.0f,0.06f}, {1000.0f,0.07f}, {2000.0f,0.09f}, {4000.0f,0.08f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.11f}, {500.0f,0.13f}, {1000.0f,0.15f}, {2000.0f,0.16f}, {4000.0f,0.2f} }, // Scattering + { {125.0f,0.02f}, {250.0f,0.01f}, {500.0f,0.0063f}, {1000.0f,0.0035f}, {2000.0f,0.0011f}, {4000.0f,0.0006f} } // Transmission + }; + const AudioMaterial curtain = + { + { {125.0f,0.07f}, {250.0f,0.31f}, {500.0f,0.49f}, {1000.0f,0.75f}, {2000.0f,0.7f}, {4000.0f,0.6f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.15f}, {500.0f,0.2f}, {1000.0f,0.3f}, {2000.0f,0.4f}, {4000.0f,0.5f} }, // Scattering + { {125.0f,0.42f}, {250.0f,0.39f}, {500.0f,0.21f}, {1000.0f,0.14f}, {2000.0f,0.079f}, {4000.0f,0.045f} } // Transmission + }; + const AudioMaterial foliage = + { + { {125.0f,0.03f}, {250.0f,0.06f}, {500.0f,0.11f}, {1000.0f,0.17f}, {2000.0f,0.27f}, {4000.0f,0.31f} }, // Absorption + { {125.0f,0.2f}, {250.0f,0.3f}, {500.0f,0.4f}, {1000.0f,0.5f}, {2000.0f,0.7f}, {4000.0f,0.8f} }, // Scattering + { 
{125.0f,0.9f}, {250.0f,0.9f}, {500.0f,0.9f}, {1000.0f,0.8f}, {2000.0f,0.5f}, {4000.0f,0.3f} } // Transmission + }; + const AudioMaterial glass = + { + { {125.0f,0.35f}, {250.0f,0.25f}, {500.0f,0.18f}, {1000.0f,0.12f}, {2000.0f,0.07f}, {4000.0f,0.05f} }, // Absorption + { {125.0f,0.05f}, {250.0f,0.05f}, {500.0f,0.05f}, {1000.0f,0.05f}, {2000.0f,0.05f}, {4000.0f,0.05f} }, // Scattering + { {125.0f,0.125f}, {250.0f,0.089f}, {500.0f,0.05f}, {1000.0f,0.028f}, {2000.0f,0.022f}, {4000.0f,0.079f} } // Transmission + }; + const AudioMaterial glassHeavy = + { + { {125.0f,0.18f}, {250.0f,0.06f}, {500.0f,0.04f}, {1000.0f,0.03f}, {2000.0f,0.02f}, {4000.0f,0.02f} }, // Absorption + { {125.0f,0.05f}, {250.0f,0.05f}, {500.0f,0.05f}, {1000.0f,0.05f}, {2000.0f,0.05f}, {4000.0f,0.05f} }, // Scattering + { {125.0f,0.056f}, {250.0f,0.039f}, {500.0f,0.028f}, {1000.0f,0.02f}, {2000.0f,0.032f}, {4000.0f,0.014f} } // Transmission + }; + const AudioMaterial grass = + { + { {125.0f,0.11f}, {250.0f,0.26f}, {500.0f,0.6f}, {1000.0f,0.69f}, {2000.0f,0.92f}, {4000.0f,0.99f} }, // Absorption + { {125.0f,0.3f}, {250.0f,0.3f}, {500.0f,0.4f}, {1000.0f,0.5f}, {2000.0f,0.6f}, {4000.0f,0.7f} }, // Scattering + { {125.0f,0.0f}, {250.0f,0.0f}, {500.0f,0.0f}, {1000.0f,0.0f}, {2000.0f,0.0f}, {4000.0f,0.0f} } // Transmission + }; + const AudioMaterial gravel = + { + { {125.0f,0.25f}, {250.0f,0.6f}, {500.0f,0.65f}, {1000.0f,0.70f}, {2000.0f,0.75f}, {4000.0f,0.80f} }, // Absorption + { {125.0f,0.2f}, {250.0f,0.3f}, {500.0f,0.4f}, {1000.0f,0.5f}, {2000.0f,0.6f}, {4000.0f,0.7f} }, // Scattering + { {125.0f,0.0f}, {250.0f,0.0f}, {500.0f,0.0f}, {1000.0f,0.0f}, {2000.0f,0.0f}, {4000.0f,0.0f} } // Transmission + }; + const AudioMaterial gypsumBoard = + { + { {125.0f,0.29f}, {250.0f,0.10f}, {500.0f,0.05f}, {1000.0f,0.04f}, {2000.0f,0.07f}, {4000.0f,0.09f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.11f}, {500.0f,0.12f}, {1000.0f,0.13f}, {2000.0f,0.14f}, {4000.0f,0.15f} }, // Scattering + { {125.0f,0.035f}, {250.0f,0.0125f}, {500.0f,0.0056f}, {1000.0f,0.0025f}, {2000.0f,0.0013f}, {4000.0f,0.0032f} } // Transmission + }; + const AudioMaterial plasterOnBrick = + { + { {125.0f,0.01f}, {250.0f,0.02f}, {500.0f,0.02f}, {1000.0f,0.03f}, {2000.0f,0.04f}, {4000.0f,0.05f} }, // Absorption + { {125.0f,0.2f}, {250.0f,0.25f}, {500.0f,0.3f}, {1000.0f,0.35f}, {2000.0f,0.4f}, {4000.0f,0.45f} }, // Scattering + { {125.0f,0.025f}, {250.0f,0.019f}, {500.0f,0.01f}, {1000.0f,0.0045f}, {2000.0f,0.0018f}, {4000.0f,0.00089f} } // Transmission + }; + const AudioMaterial plasterOnConcreteBlock = + { + { {125.0f,0.12f}, {250.0f,0.09f}, {500.0f,0.07f}, {1000.0f,0.05f}, {2000.0f,0.05f}, {4000.0f,0.04f} }, // Absorption + { {125.0f,0.2f}, {250.0f,0.25f}, {500.0f,0.3f}, {1000.0f,0.35f}, {2000.0f,0.4f}, {4000.0f,0.45f} }, // Scattering + { {125.0f,0.02f}, {250.0f,0.01f}, {500.0f,0.0063f}, {1000.0f,0.0035f}, {2000.0f,0.00011f}, {4000.0f,0.00063f} } // Transmission + }; + const AudioMaterial soil = + { + { {125.0f,0.15f}, {250.0f,0.25f}, {500.0f,0.4f}, {1000.0f,0.55f}, {2000.0f,0.6f}, {4000.0f,0.6f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.2f}, {500.0f,0.25f}, {1000.0f,0.4f}, {2000.0f,0.55f}, {4000.0f,0.7f} }, // Scattering + { {125.0f,0.0f}, {250.0f,0.0f}, {500.0f,0.0f}, {1000.0f,0.0f}, {2000.0f,0.0f}, {4000.0f,0.0f} } // Transmission + }; + const AudioMaterial soundProof = + { + { {125.0f,1.0f}, {250.0f,1.0f}, {500.0f,1.0f}, {1000.0f,1.0f}, {2000.0f,1.0f}, {4000.0f,1.0f} }, // Absorption + { {125.0f,0.0f}, {250.0f,0.0f}, {500.0f,0.0f}, {1000.0f,0.0f}, {2000.0f,0.0f}, 
{4000.0f,0.0f} }, // Scattering + { {125.0f,0.0f}, {250.0f,0.0f}, {500.0f,0.0f}, {1000.0f,0.0f}, {2000.0f,0.0f}, {4000.0f,0.0f} } // Transmission + }; + const AudioMaterial snow = + { + { {125.0f,0.45f}, {250.0f,0.75f}, {500.0f,0.90f}, {1000.0f,0.95f}, {2000.0f,0.95f}, {4000.0f,0.95f} }, // Absorption + { {125.0f,0.2f}, {250.0f,0.3f}, {500.0f,0.4f}, {1000.0f,0.5f}, {2000.0f,0.6f}, {4000.0f,0.7f} }, // Scattering + { {125.0f,0.0f}, {250.0f,0.0f}, {500.0f,0.0f}, {1000.0f,0.0f}, {2000.0f,0.0f}, {4000.0f,0.0f} } // Transmission + }; + const AudioMaterial steel = + { + { {125.0f,0.05f}, {250.0f,0.10f}, {500.0f,0.10f}, {1000.0f,0.10f}, {2000.0f,0.07f}, {4000.0f,0.02f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.1f}, {500.0f,0.1f}, {1000.0f,0.1f}, {2000.0f,0.1f}, {4000.0f,0.1f} }, // Scattering + { {125.0f,0.25f}, {250.0f,0.2f}, {500.0f,0.17f}, {1000.0f,0.089f}, {2000.0f,0.089f}, {4000.0f,0.056f} } // Transmission + }; + const AudioMaterial water = + { + { {125.0f,0.01f}, {250.0f,0.01f}, {500.0f,0.01f}, {1000.0f,0.02f}, {2000.0f,0.02f}, {4000.0f,0.03f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.1f}, {500.0f,0.1f}, {1000.0f,0.07f}, {2000.0f,0.05f}, {4000.0f,0.05f} }, // Scattering + { {125.0f,0.03f}, {250.0f,0.03f}, {500.0f,0.03f}, {1000.0f,0.02f}, {2000.0f,0.015f}, {4000.0f,0.01f} } // Transmission + }; + const AudioMaterial woodFloor = + { + { {125.0f,0.15f}, {250.0f,0.11f}, {500.0f,0.10f}, {1000.0f,0.07f}, {2000.0f,0.06f}, {4000.0f,0.07f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.1f}, {500.0f,0.1f}, {1000.0f,0.1f}, {2000.0f,0.1f}, {4000.0f,0.15f} }, // Scattering + { {125.0f,0.071f}, {250.0f,0.025f}, {500.0f,0.0158f}, {1000.0f,0.0056f}, {2000.0f,0.0035f}, {4000.0f,0.0016f} } // Transmission + }; + const AudioMaterial woodOnConcrete = + { + { {125.0f,0.04f}, {250.0f,0.04f}, {500.0f,0.07f}, {1000.0f,0.06f}, {2000.0f,0.06f}, {4000.0f,0.07f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.1f}, {500.0f,0.1f}, {1000.0f,0.1f}, {2000.0f,0.1f}, {4000.0f,0.15f} }, // Scattering + { {125.0f,0.004f}, {250.0f,0.0079f}, {500.0f,0.0056f}, {1000.0f,0.0016f}, {2000.0f,0.0014f}, {4000.0f,0.0005f} } // Transmission + }; + const AudioMaterial woodThin = + { + { {125.0f,0.42f}, {250.0f,0.21f}, {500.0f,0.10f}, {1000.0f,0.08f}, {2000.0f,0.06f}, {4000.0f,0.06f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.1f}, {500.0f,0.1f}, {1000.0f,0.1f}, {2000.0f,0.1f}, {4000.0f,0.15f} }, // Scattering + { {125.0f,0.2f}, {250.0f,0.125f}, {500.0f,0.079f}, {1000.0f,0.1f}, {2000.0f,0.089f}, {4000.0f,0.05f} } // Transmission + }; + const AudioMaterial woodThick = + { + { {125.0f,0.19f}, {250.0f,0.14f}, {500.0f,0.09f}, {1000.0f,0.06f}, {2000.0f,0.06f}, {4000.0f,0.05f} }, // Absorption + { {125.0f,0.1f}, {250.0f,0.1f}, {500.0f,0.1f}, {1000.0f,0.1f}, {2000.0f,0.1f}, {4000.0f,0.15f} }, // Scattering + { {125.0f,0.035f}, {250.0f,0.028f}, {500.0f,0.028f}, {1000.0f,0.028f}, {2000.0f,0.0011f}, {4000.0f,0.0071f} } // Transmission + }; +} + +// A map from semantic class name to the associated acoustic material. 
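+// Classes with no dedicated material entry are mapped to materials::default
+// near the bottom of the table.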
+const std::map classMaterialMap = +{ + { "floor", &materials::carpet }, + { "fireplace", &materials::brick }, + { "backpack", &materials::curtain }, + { "clothes", &materials::curtain }, + { "base-cabinet", &materials::woodFloor }, + { "bathtub", &materials::steel }, + { "beam", &materials::steel }, + { "beanbag", &materials::curtain }, + { "bed", &materials::curtain }, + { "blanket", &materials::curtain }, + { "blinds", &materials::glass }, + { "mirror", &materials::glass }, + { "tv_monitor", &materials::glass }, + { "lighting", &materials::glass }, + { "cabinet", &materials::woodFloor }, + { "chair", &materials::woodThick }, + { "furniture", &materials::woodThick }, + { "chopping-board", &materials::woodThick }, + { "cloth", &materials::curtain }, + { "clothing", &materials::curtain }, + { "comforter", &materials::curtain }, + { "cushion", &materials::curtain }, + { "curtain", &materials::curtain }, + { "ceiling", &materials::acousticTile }, + { "countertop", &materials::woodThick }, + { "counter", &materials::woodThick }, + { "shelving", &materials::woodThick }, + { "desk", &materials::woodThick }, + { "door", &materials::woodThick }, + { "seating", &materials::woodThick }, + { "chest_of_drawers", &materials::woodThick }, + { "stairs", &materials::woodThick }, + { "floor", &materials::carpet }, + { "handbag", &materials::curtain }, + { "handrail", &materials::steel }, + { "railing", &materials::steel }, + { "appliances", &materials::steel }, + { "indoor-plant", &materials::foliage }, + { "major-appliance", &materials::steel }, + { "mat", &materials::carpet }, + { "microwave", &materials::steel }, + { "nightstand", &materials::woodThick }, + { "board_panel", &materials::woodThick }, + { "pipe", &materials::steel }, + { "refrigerator", &materials::steel }, + { "gym_equipment", &materials::steel }, + { "scarf", &materials::curtain }, + { "shelf", &materials::woodThick }, + { "shower-stall", &materials::ceramicTile }, + { "shower", &materials::ceramicTile }, + { "sink", &materials::steel }, + { "sofa", &materials::curtain }, + { "stair", &materials::woodFloor }, + { "stool", &materials::woodThick }, + { "table", &materials::woodThick }, + { "table-runner", &materials::woodThick }, + { "toilet", &materials::ceramicTile }, + { "wall", &materials::gypsumBoard }, + { "wardrobe", &materials::woodThick }, + { "window", &materials::glass }, + { "rug", &materials::carpetHeavy }, + { "bag", &materials::curtain }, + { "set-of-clothing", &materials::curtain }, + { "towel", &materials::curtain }, + { "plant", &materials::foliage }, + + // Materials without corresponding material type. 
+ { "(null)", &materials::default }, + { "void", &materials::default }, + { "misc", &materials::default }, + { "objects", &materials::default }, + { "picture", &materials::default }, + { "column", &materials::default }, +}; diff --git a/res/logo.png b/res/logo.png new file mode 100644 index 0000000..c2bdde9 Binary files /dev/null and b/res/logo.png differ diff --git a/res/soundspaces-demo.gif b/res/soundspaces-demo.gif new file mode 100644 index 0000000..32cb44c Binary files /dev/null and b/res/soundspaces-demo.gif differ diff --git a/scripts/AmbisonicBinauralizer b/scripts/AmbisonicBinauralizer new file mode 100755 index 0000000..5c45fb0 Binary files /dev/null and b/scripts/AmbisonicBinauralizer differ diff --git a/scripts/ambisonic_to_binaural.py b/scripts/ambisonic_to_binaural.py new file mode 100644 index 0000000..c0bccbc --- /dev/null +++ b/scripts/ambisonic_to_binaural.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import subprocess +from concurrent.futures import ThreadPoolExecutor +import os + + +def binauralize_ambisonic_irs_with_angle(scene_ambisonic_dir, scene_binaural_dir, angle, exe_file): + angle_scene_binaural_dir = os.path.join(scene_binaural_dir, str(angle)) + + command = [exe_file, "-i", scene_ambisonic_dir, "-o", angle_scene_binaural_dir, "-a", str(angle)] + ret = subprocess.run(command, check=True, capture_output=True) + print(ret.stdout, ret.stderr) + + +def main(): + data_dir = './data' + ambisonic_dir = os.path.join(data_dir, 'ambisonic_rirs/mp3d') + binaural_dir = os.path.join(data_dir, 'binaural_rirs/mp3d') + + angles = [0, 90, 180, 270] + exe_file = './scripts/AmbisonicBinauralizer' + scenes = os.listdir(ambisonic_dir) + args = list() + for scene in scenes: + scene_ambisonic_dir = os.path.join(ambisonic_dir, scene, 'irs') + scene_binaural_dir = os.path.join(binaural_dir, scene) + os.makedirs(scene_binaural_dir, exist_ok=True) + + for angle in angles: + angle_scene_binaural_dir = os.path.join(scene_binaural_dir, str(angle)) + if os.path.exists(angle_scene_binaural_dir) and len(os.listdir(scene_ambisonic_dir)) == len( + os.listdir(angle_scene_binaural_dir)): + continue + print(angle_scene_binaural_dir) + args.append((scene_ambisonic_dir, scene_binaural_dir, angle, exe_file)) + with ThreadPoolExecutor(max_workers=160) as executor: + executor.map(binauralize_ambisonic_irs_with_angle, *[[arg[i] for arg in args] for i in range(len(args[0]))]) + + +if __name__ == '__main__': + main() diff --git a/scripts/cache_observations.py b/scripts/cache_observations.py new file mode 100644 index 0000000..529328e --- /dev/null +++ b/scripts/cache_observations.py @@ -0,0 +1,119 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Any, List, Optional +from abc import ABC +import os +import argparse +import logging +import pickle +from collections import defaultdict + +import numpy as np + +import habitat_sim +from habitat.core.registry import registry +from habitat.core.simulator import AgentState, Simulator +from habitat.sims.habitat_simulator.habitat_simulator import HabitatSim +from habitat_sim.utils.common import quat_to_angle_axis, quat_to_coeffs, quat_from_angle_axis, quat_from_coeffs +from habitat.tasks.nav.nav import NavigationEpisode, NavigationGoal, ShortestPathPoint +from soundspaces.tasks.audionav_task import merge_sim_episode_config +from soundspaces.utils import load_metadata +from soundspaces.simulator import SoundSpacesSim +from ss_baselines.av_nav.config import get_config +import sys + + +class Sim(SoundSpacesSim): + def step(self, action): + sim_obs = self._sim.get_sensor_observations() + return sim_obs, self._rotation_angle + + +# if want to generate rgb of 224,224 change config +# for 128*128 use default config +# python scripts/cache_observations.py --config-path ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_rgb_question.yaml --obs-dir data1/scene_observations_224/ (if 224*224) + +def main(dataset): + parser = argparse.ArgumentParser() + parser.add_argument( + "--config-path", + type=str, + default='ss_baselines/av_nav/config/audionav/{}/train_telephone/pointgoal_rgb.yaml'.format(dataset) + ) + parser.add_argument( + "--obs-dir", + type=str, + default='data/scene_observations/' + ) + parser.add_argument( + "opts", + default=None, + nargs=argparse.REMAINDER, + help="Modify config options from command line", + ) + args = parser.parse_args() + config = get_config(args.config_path, opts=args.opts) + config.defrost() + config.TASK_CONFIG.SIMULATOR.AGENT_0.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR"] + config.TASK_CONFIG.SIMULATOR.USE_RENDERED_OBSERVATIONS = False + config.freeze() + simulator = None + scene_obs = defaultdict(dict) + num_obs = 0 + scene_obs_dir = args.obs_dir + dataset + os.makedirs(scene_obs_dir, exist_ok=True) + metadata_dir = 'data/metadata/' + dataset + for scene in os.listdir(metadata_dir): + scene_obs = dict() + scene_metadata_dir = os.path.join(metadata_dir, scene) + points, graph = load_metadata(scene_metadata_dir) + if dataset == 'replica': + scene_mesh_dir = os.path.join('data/scene_datasets', dataset, scene, 'habitat/mesh_semantic.ply') + else: + scene_mesh_dir = os.path.join('data/scene_datasets', dataset, scene, scene + '.glb') + + for node in graph.nodes(): + agent_position = graph.nodes()[node]['point'] + for angle in [0, 90, 180, 270]: + agent_rotation = quat_to_coeffs(quat_from_angle_axis(np.deg2rad(angle), np.array([0, 1, 0]))).tolist() + goal_radius = 0.00001 + goal = NavigationGoal( + position=agent_position, + radius=goal_radius + ) + episode = NavigationEpisode( + goals=[goal], + episode_id=str(0), + scene_id=scene_mesh_dir, + start_position=agent_position, + start_rotation=agent_rotation, + info={'sound': 'telephone'} + ) + + episode_sim_config = merge_sim_episode_config(config.TASK_CONFIG.SIMULATOR, episode) + if simulator is None: + simulator = Sim(episode_sim_config) + simulator.reconfigure(episode_sim_config) + + obs, rotation_index = simulator.step(None) + scene_obs[(node, rotation_index)] = obs + num_obs += 1 + + print('Total number of observations: {}'.format(num_obs)) + with open(os.path.join(scene_obs_dir, '{}.pkl'.format(scene)), 'wb') as fo: + pickle.dump(scene_obs, fo) + simulator.close() + del simulator + + +if 
__name__ == '__main__':
+    #print('Caching Replica observations ...')
+    #main('replica')
+    print('Caching Matterport3D observations ...')
+    main('mp3d')
diff --git a/scripts/clip_checking.py b/scripts/clip_checking.py
new file mode 100644
index 0000000..809f2b9
--- /dev/null
+++ b/scripts/clip_checking.py
@@ -0,0 +1,27 @@
+# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL)
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+import torch
+import clip
+from PIL import Image
+import requests
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load("ViT-B/32", device=device)
+
+image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
+#url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+#image = preprocess(Image.open(requests.get(url, stream=True).raw)).unsqueeze(0).to(device)
+
+text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)
+print(text)
+
+with torch.no_grad():
+    image_features = model.encode_image(image)
+    text_features = model.encode_text(text)
+    print(text_features.size())
+
+    logits_per_image, logits_per_text = model(image, text)
+    probs = logits_per_image.softmax(dim=-1).cpu().numpy()
+
+print("Label probs:", probs)  # prints: [[0.9927937 0.00421068 0.00299572]]
diff --git a/scripts/cont2dis_mapping.py b/scripts/cont2dis_mapping.py
new file mode 100644
index 0000000..8faf776
--- /dev/null
+++ b/scripts/cont2dis_mapping.py
@@ -0,0 +1,151 @@
+# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL)
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+from typing import Any, List, Optional
+from abc import ABC
+import os
+import argparse
+import logging
+import pickle
+from collections import defaultdict
+import numpy as np
+import sys
+import networkx as nx
+import json
+from math import sqrt
+
+from soundspaces.utils import load_metadata
+
+r_mat = np.array([[1,0,0,0],[0,0,1,0],[0,-1,0,0],[0,0,0,1]])
+
+
+def load_nav_graphs_vln(scans):
+    ''' Load connectivity graph for each scan '''
+
+    def distance(pose1, pose2):
+        ''' Euclidean distance between two graph poses '''
+        return ((pose1['pose'][3]-pose2['pose'][3])**2\
+            + (pose1['pose'][7]-pose2['pose'][7])**2\
+            + (pose1['pose'][11]-pose2['pose'][11])**2)**0.5
+
+    graphs = {}
+    for scan in scans:
+        with open('connectivity/%s_connectivity.json' % scan) as f:
+            G = nx.Graph()
+            positions = {}
+            data = json.load(f)
+            for i,item in enumerate(data):
+                if item['included']:
+                    for j,conn in enumerate(item['unobstructed']):
+                        if conn and data[j]['included']:
+                            positions[item['image_id']] = np.array([item['pose'][3],
+                                    item['pose'][7], item['pose'][11]]);
+                            assert data[j]['unobstructed'][i], 'Graph should be undirected'
+                            G.add_edge(item['image_id'],data[j]['image_id'],weight=distance(item,data[j]))
+            nx.set_node_attributes(G, values=positions, name='position')
+            graphs[scan] = G
+    return graphs
+
+
+
+
+if __name__== '__main__':
+
+    # for metadata (points in soundspace)
+    dataset = 'mp3d'  # not done for replica
+    metadata_dir = './data/metadata/' + dataset
+    scans = os.listdir(metadata_dir)
+
+    # for points from graph in vln
+    vln_graphs = load_nav_graphs_vln(scans)
+
+    node2view = {}
+    # node2view = {'scene': {sound_node: vln_view_id}}
+
+    for idx, scene in enumerate(scans):
+        if idx%5==0:
+            print('{}/{} scene done'.format(idx, len(scans)))
+        node2view[scene] = {}
+
+        # graph from soundspace
+        scene_metadata_dir = os.path.join(metadata_dir, scene)
+        _, graph_sound = load_metadata(scene_metadata_dir)
+
+        # graph from vln dataset
+        vln_G = vln_graphs[scene]
+
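+        # The 4x4 camera poses in the connectivity files are in the Matterport
+        # (z-up) world frame; left-multiplying by r_mat rotates them into the
+        # y-up frame used by the SoundSpaces graphs, so the two point sets can
+        # be compared directly below.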
+        # collect the location information from all the views in the vln_graph
+        with open('./connectivity/{}_connectivity.json'.format(scene), 'r') as f:
+            scene_connectivity = json.load(f)  # list
+
+        view_location = {}
+        for node_vln in vln_G.nodes():
+            for view in scene_connectivity:
+                if node_vln == view['image_id']:
+                    pose = np.array(view['pose']).reshape(4,4)
+                    pose = np.matmul(r_mat, pose)
+                    view_location[node_vln] = np.array([pose[0,3], pose[1,3], pose[2,3]])
+                    break
+
+
+        for node_sound in graph_sound.nodes():
+            location_sound = np.array(graph_sound.nodes[node_sound]['point'])
+            # find the closest viewpoint
+            dist_all = []
+            node_name_vln = []
+
+            for node_vln, location in view_location.items():
+                if location[1]>= location_sound[1] and location[1]< location_sound[1]+2.99:
+                    # not making sure this is in the same room
+                    dist = np.linalg.norm(np.array([location[0], location[2]]) - np.array([location_sound[0], location_sound[2]]))
+                    dist_all.append(dist)
+                    node_name_vln.append(node_vln)
+
+            dist_all = np.array(dist_all)
+            node2view[scene][node_sound] = node_name_vln[np.argmin(dist_all)]
+            # print(node2view)
+            # print('vln', view_location[node2view[scene][node_sound]])
+            # print('sound', location_sound)
+
+    # save the node2view dict in json file
+    with open('./data/node2view.json', 'w') as outfile:
+        json.dump(node2view, outfile)
\ No newline at end of file
diff --git a/scripts/data_checking.py b/scripts/data_checking.py
new file mode 100644
index 0000000..c66229d
--- /dev/null
+++ b/scripts/data_checking.py
@@ -0,0 +1,80 @@
+# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL)
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import json
+import numpy as np
+import os
+import sys
+import gzip
+from soundspaces.utils import load_metadata
+
+# episode_file = '/home/sudipta/isavi/dialog_audionav/data/datasets/semantic_audionav/mp3d/v1/train/content/1LXtFkjw3qL.json.gz'
+episode_file = '/home/sudipta/isavi/dialog_audionav/data/datasets/semantic_audionav_dialog_approx/mp3d/v1/train/content/1LXtFkjw3qL.json.gz'
+SEMANTIC_AUDIO_EPISODE_DIR = './data/datasets/semantic_audionav/mp3d/v1'
+SPLIT = 'train'
+GRAPH_DIR_PATH = './data/metadata/mp3d'
+
+def get_scans(path=None):
+    semantic_split_path = os.path.join(SEMANTIC_AUDIO_EPISODE_DIR, SPLIT, 'content')
+    scans = []
+    if path is None:
+        for elem in os.listdir(semantic_split_path):
+            scans.append(elem.split('.')[0])
+    else:
+        for elem in os.listdir(path):
+            scans.append(elem.split('.')[0])
+
+    return scans
+
+if __name__=='__main__':
+    '''
+    with gzip.open(episode_file) as f:
+        json_bytes = f.read()
+        json_str = json_bytes.decode('utf-8')
+        audionav_episodes = json.loads(json_str)['episodes']  # list
+
+    print(audionav_episodes[0])
+
+    '''
+    scans = get_scans()
+    for scan in scans:
+        scan_path = os.path.join(GRAPH_DIR_PATH, scan)
+        points, sound_G = load_metadata(scan_path)
+        for node in sound_G.nodes():
+            print(node, type(np.array(sound_G.nodes[node]['point'])), np.array(sound_G.nodes[node]['point']))
+        break
+
+
+
+
+
+'''
+episode_file = '/home/sudipta/isavi/dialog_audionav/data/datasets/semantic_audionav/mp3d/v1/train/content/1LXtFkjw3qL.json.gz'
+
+{'episode_id': '554',
+ 'scene_id': '1LXtFkjw3qL/1LXtFkjw3qL.glb',
+ 'start_position': [-0.73757, 0.03187399999999996, 7.43675],
+ 'start_rotation': [0.0, 1.0, 0.0, 6.123233995736766e-17],
+ 'info': {'geodesic_distance': 9.0, 'num_action': 12},
+ 'goals': [{'position': [1.26243, 0.033528, 14.43675],
'radius': 1e-05, 'object_id': 242, 'object_name': None, + 'object_category': None, 'room_id': None, 'room_name': None, + 'view_points': [[1.26243, 0.033528, 14.43675], [1.26243, 0.03187399999999996, 13.43675], + [1.26243, 0.03187399999999996, 12.43675], [1.26243, 0.03187399999999996, 11.43675], + [1.26243, 0.03187399999999996, 10.43675], [1.26243, 0.03187399999999996, 9.43675]]}], + 'start_room': None, 'shortest_paths': None, 'object_category': 'table', 'sound_id': 'train/table.wav', + 'offset': '10', 'duration': '22'} + +episode_file = '/home/sudipta/isavi/dialog_audionav/data/datasets/semantic_audionav_dialog_approx/mp3d/v1/train/content/1LXtFkjw3qL.json.gz' + +{'episode_id': '326', +'scene_id': '1LXtFkjw3qL/1LXtFkjw3qL.glb', +'start_position': [-6.73757, -0.012620000000000076, 11.43675], +'start_rotation': [0, -0.7722588196467437, 0, -0.6353080477042756], +'info': {'geodesic_distance': 9.0, 'num_action': 13}, +'goals': [{'position': [-3.73757, -0.012620000000000076, 7.43675], 'radius': 1e-05, 'object_id': 233, + 'object_name': None, 'object_category': None, 'room_id': None, 'room_name': None, + 'view_points': [[-3.73757, -0.012620000000000076, 7.43675]]}], + 'start_room': None, 'shortest_paths': None, 'object_category': 'chair', 'sound_id': 'train/chair.wav', + 'offset': '2', 'duration': '7', 'dialog_node': [97, 120], 'sub_instr': 'go inside use the door on the left'} +''' diff --git a/scripts/download_data.py b/scripts/download_data.py new file mode 100644 index 0000000..dde74e6 --- /dev/null +++ b/scripts/download_data.py @@ -0,0 +1,43 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import argparse +import collections +import os +import subprocess + + +def download_and_uncompress(url, output_dir): + scene_file = os.path.basename(url) + print(f'Downloading {url} ...') + if not os.path.exists(scene_file): + subprocess.run(['wget', url]) + subprocess.run(['tar', '-xzf', scene_file]) + subprocess.run(['rm', scene_file]) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--output-dir', default='data') + parser.add_argument('--dataset', default='mp3d', choices=['mp3d', 'replica']) + parser.add_argument('--rir-type', default='binaural_rirs', choices=['binaural_rirs', 'ambisonic_rirs']) + args = parser.parse_args() + + dataset_rir_dir = os.path.join(args.output_dir, args.rir_type, args.dataset) + aws_root_dir = 'http://dl.fbaipublicfiles.com/SoundSpaces/' + scenes = os.listdir(os.path.join('data/metadata/', args.dataset)) + for scene in scenes: + scene_file = os.path.join(aws_root_dir, args.rir_type, args.dataset, scene + '.tar.gz') + if os.path.exists(os.path.join(dataset_rir_dir, scene)): + continue + else: + download_and_uncompress(scene_file, args.output_dir) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/scripts/generate_matterport3d_metadata.py b/scripts/generate_matterport3d_metadata.py new file mode 100644 index 0000000..bd8f554 --- /dev/null +++ b/scripts/generate_matterport3d_metadata.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
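+
+# This script builds the per-scene navigability graph for Matterport3D: grid
+# points from the scene metadata are kept if the navmesh marks them navigable,
+# and two points are connected when they are roughly one grid step apart
+# (0.1 m < Euclidean distance < 1.01 m) and the geodesic distance on the
+# navmesh stays below 1.3 m (see generate_graph below). Run from the
+# repository root, e.g.:
+#
+#   python scripts/generate_matterport3d_metadata.py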
+
+import pickle
+import os
+
+import numpy as np
+import networkx as nx
+import matplotlib.pyplot as plt
+
+import habitat_sim
+import habitat_sim.bindings as hsim
+from networkx import connected_components
+from soundspaces.utils import load_metadata
+
+
+def adjust_graph(graph, points, name):
+    subgraphs = [graph.subgraph(c) for c in connected_components(graph)]
+    nodes_to_remove = []
+    edges_to_remove = []
+    for subgraph in subgraphs:
+        if len(subgraph.nodes) < 10:
+            nodes_to_remove += subgraph.nodes
+            edges_to_remove += subgraph.edges
+    graph.remove_nodes_from(nodes_to_remove)
+    graph.remove_edges_from(edges_to_remove)
+    return True
+
+
+def visualize(points, graph, filename=None, save_figure=False, plot_indices=False, output_dir=''):
+    if not plot_indices:
+        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))
+        if filename is not None:
+            fig.suptitle(filename, fontsize=20)
+        for point in points:
+            ax1.scatter(point[0], point[2], 9, c='black')
+        ax1.set_title('All Points')
+
+        for node in graph.nodes():
+            point = graph.nodes[node]['point']
+            ax2.scatter(point[0], point[2], 9, c='black')
+        ax2.set_title('Navigable Points')
+
+        for node in graph.nodes():
+            point = graph.nodes[node]['point']
+            ax3.scatter(point[0], point[2], 9, c='black')
+
+        for n1, n2 in graph.edges():
+            p1 = graph.nodes[n1]['point']
+            p2 = graph.nodes[n2]['point']
+            ax3.plot([p1[0], p2[0]], [p1[2], p2[2]], c='green')
+        ax3.set_title('Connected Graph')
+    else:
+        fig, ax = plt.subplots(1, 1, figsize=(30, 20))
+        if filename is not None:
+            fig.suptitle(filename, fontsize=20)
+        for node in graph.nodes():
+            point = graph.nodes[node]['point']
+            ax.scatter(point[0], point[2], 9, c='black')
+            ax.annotate(str(node), (point[0], point[2]), fontsize='x-small')
+
+        for n1, n2 in graph.edges():
+            p1 = graph.nodes[n1]['point']
+            p2 = graph.nodes[n2]['point']
+            ax.plot([p1[0], p2[0]], [p1[2], p2[2]], c='green')
+
+    if save_figure:
+        file_path = os.path.join(output_dir, filename)
+        if os.path.exists(file_path):
+            os.remove(file_path)
+        plt.savefig(file_path)
+    else:
+        plt.show()
+    plt.close()
+
+
+def generate_graph(points, pathfinder):
+    navigable_idx = [i for i, p in enumerate(points) if pathfinder.is_navigable(p)]
+    graph = nx.Graph()
+    for idx in navigable_idx:
+        graph.add_node(idx, point=points[idx])
+
+    for a_idx, a_loc in enumerate(points):
+        if a_idx not in navigable_idx:
+            continue
+        for b_idx, b_loc in enumerate(points):
+            if b_idx not in navigable_idx:
+                continue
+            if a_idx == b_idx:
+                continue
+
+            euclidean_distance = np.linalg.norm(np.array(a_loc) - np.array(b_loc))
+            if 0.1 < euclidean_distance < 1.01:
+                path = habitat_sim.ShortestPath()
+                path.requested_start = np.array(a_loc, dtype=np.float32)
+                path.requested_end = np.array(b_loc, dtype=np.float32)
+                pathfinder.find_path(path)
+                # relax the constraint a bit
+                if path.geodesic_distance < 1.3:
+                    graph.add_edge(a_idx, b_idx)
+
+    return graph
+
+
+def main():
+    metadata_folder = os.path.join('data/metadata/mp3d')
+    scenes = os.listdir(metadata_folder)
+    for scene in scenes:
+        navmesh_file = "data/scene_datasets/mp3d/{}/{}.navmesh".format(scene, scene)
+        scene_metadata_folder = os.path.join(metadata_folder, scene)
+        graph_file = os.path.join(scene_metadata_folder, 'graph.pkl')
+        visualization_dir = 'data/visualizations/mp3d'
+        os.makedirs(scene_metadata_folder, exist_ok=True)
+        os.makedirs(visualization_dir, exist_ok=True)
+
+        pathfinder = hsim.PathFinder()
+        pathfinder.load_nav_mesh(navmesh_file)
+        points, _ = load_metadata(scene_metadata_folder)
+
+        graph = generate_graph(points, pathfinder)
+        visualize(points, graph, scene, save_figure=True, plot_indices=True, output_dir=visualization_dir)
+
+        adjusted = adjust_graph(graph, points, scene)
+        if adjusted:
+            visualize(points, graph, scene + '_fix', save_figure=True, plot_indices=True, output_dir=visualization_dir)
+
+        with open(graph_file, 'wb') as fo:
+            pickle.dump(graph, fo)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/generate_replica_metadata.py b/scripts/generate_replica_metadata.py
new file mode 100644
index 0000000..fd5ac7b
--- /dev/null
+++ b/scripts/generate_replica_metadata.py
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# SPDX-License-Identifier: CC-BY-4.0
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pickle
+import os
+import shutil
+
+import numpy as np
+import networkx as nx
+import matplotlib.pyplot as plt
+
+import habitat_sim
+import habitat_sim.bindings as hsim
+from soundspaces.utils import load_metadata
+
+
+def adjust_graph(graph, points, name):
+    # manual fixes
+    if name == 'apartment_0':
+        upper_floor_nodes = list()
+        for node in graph.nodes():
+            if graph.nodes[node]['point'][1] > 0.5:
+                upper_floor_nodes.append(node)
+        graph.remove_nodes_from(upper_floor_nodes)
+        graph.remove_nodes_from([0, 30, 31, 32, 47, 48, 49, 50, 67, 90, 173, 378, 509])
+    elif name == 'apartment_2':
+        graph.remove_nodes_from([0, 168, 187, 218, 237])
+        graph.add_node(191, point=points[191])
+        graph.add_edges_from([(173, 191), (191, 209)])
+    elif name == 'frl_apartment_1':
+        graph.remove_nodes_from([164, 177])
+    elif name == 'frl_apartment_2':
+        graph.remove_nodes_from([174])
+    elif name == 'frl_apartment_5':
+        graph.remove_nodes_from([9, 18, 26, 35])
+    elif name == 'hotel_0':
+        graph.remove_nodes_from([1])
+    elif name == 'office_0':
+        # check
+        graph.remove_nodes_from([6, 7])
+        graph.add_edges_from([(12, 21)])
+    elif name == 'office_2':
+        graph.remove_nodes_from([0, 11, 3, 9, 15, 10, 16])
+    elif name == 'office_3':
+        graph.remove_nodes_from([48, 82, 115])
+        # graph.add_edges_from([(56, 69)])
+    elif name == 'room_0':
+        graph.remove_nodes_from([123, 124, 125, 126, 127, 118, 117, 102, 103, 111, 112, 120, 121])
+        # graph.add_edges_from([(95, 103)])
+    elif name == 'room_1':
+        # graph.add_edges_from([(37, 45), (51, 59)])
+        graph.remove_nodes_from([45, 51])
+    else:
+        return False
+    return True
+
+
+def visualize(points, graph, filename=None, save_figure=False, plot_indices=False, output_dir=''):
+    if not plot_indices:
+        fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 5))
+        if filename is not None:
+            fig.suptitle(filename, fontsize=20)
+        for point in points:
+            ax1.scatter(point[0], point[2], 9, c='black')
+        ax1.set_title('All Points')
+
+        for node in graph.nodes():
+            point = graph.nodes[node]['point']
+            ax2.scatter(point[0], point[2], 9, c='black')
+        ax2.set_title('Navigable Points')
+
+        for node in graph.nodes():
+            point = graph.nodes[node]['point']
+            ax3.scatter(point[0], point[2], 9, c='black')
+
+        for n1, n2 in graph.edges():
+            p1 = graph.nodes[n1]['point']
+            p2 = graph.nodes[n2]['point']
+            ax3.plot([p1[0], p2[0]], [p1[2], p2[2]], c='green')
+        ax3.set_title('Connected Graph')
+    else:
+        fig, ax = plt.subplots(1, 1, figsize=(15, 10))
+        if filename is not None:
+            fig.suptitle(filename, fontsize=20)
+        for node in graph.nodes():
+            point = graph.nodes[node]['point']
+            ax.scatter(point[0], point[2], 9, c='black')
+            
ax.annotate(str(node), (point[0], point[2])) + for n1, n2 in graph.edges(): + p1 = graph.nodes[n1]['point'] + p2 = graph.nodes[n2]['point'] + ax.plot([p1[0], p2[0]], [p1[2], p2[2]], c='green') + + if save_figure: + file_path = os.path.join(output_dir, filename) + if os.path.exists(file_path): + os.remove(file_path) + plt.savefig(file_path) + plt.close() + + +def generate_graph(points, pathfinder): + navigable_idx = [i for i, p in enumerate(points) if pathfinder.is_navigable(p)] + graph = nx.Graph() + for idx in navigable_idx: + graph.add_node(idx, point=points[idx]) + + for a_idx, a_loc in enumerate(points): + if a_idx not in navigable_idx: + continue + for b_idx, b_loc in enumerate(points): + if b_idx not in navigable_idx: + continue + if a_idx == b_idx: + continue + + euclidean_distance = np.linalg.norm(np.array(a_loc) - np.array(b_loc)) + if 0.1 < euclidean_distance < 0.51: + path = habitat_sim.ShortestPath() + path.requested_start = np.array(a_loc, dtype=np.float32) + path.requested_end = np.array(b_loc, dtype=np.float32) + pathfinder.find_path(path) + # relax the constraint a bit + if path.geodesic_distance < 0.8: + graph.add_edge(a_idx, b_idx) + + return graph + + +def main(): + metadata_folder = os.path.join('data/metadata/replica') + scenes = os.listdir(metadata_folder) + for scene in scenes: + navmesh_file = "data/scene_datasets/replica/{}/habitat/mesh_semantic.navmesh".format(scene) + scene_metadata_folder = os.path.join(metadata_folder, scene) + graph_file = os.path.join(scene_metadata_folder, 'graph.pkl') + visualization_dir = 'data/visualizations/replica' + os.makedirs(scene_metadata_folder, exist_ok=True) + os.makedirs(visualization_dir, exist_ok=True) + + pathfinder = hsim.PathFinder() + pathfinder.load_nav_mesh(navmesh_file) + points, _ = load_metadata(scene_metadata_folder) + + graph = generate_graph(points, pathfinder) + visualize(points, graph, scene, save_figure=True, plot_indices=True, output_dir=visualization_dir) + + adjusted = adjust_graph(graph, points, scene) + if adjusted: + visualize(points, graph, scene + '_fix', save_figure=True, plot_indices=True, output_dir=visualization_dir) + + with open(graph_file, 'wb') as fo: + pickle.dump(graph, fo) + + +if __name__ == '__main__': + main() diff --git a/scripts/generate_vln_episode.py b/scripts/generate_vln_episode.py new file mode 100644 index 0000000..aa7823e --- /dev/null +++ b/scripts/generate_vln_episode.py @@ -0,0 +1,719 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +import json +import numpy as np +import gzip +import sys +import os +import math +import habitat_sim +import random +import quaternion +from soundspaces.utils import load_metadata +import networkx as nx +import numpy as np +from habitat_sim.utils.common import quat_from_angle_axis, quat_from_coeffs, quat_to_angle_axis + +# set up the paths properly + +# though it is provided in soundspace, if adjust_heading() or DummyAgent is used then this code requires matterport3d simulator (v0.1) +# path to build folder of matterport 3D simulator +PATH_2_BUILD = './ss_baselines/savi/dialog/speaker/build' +sys.path.append(PATH_2_BUILD) +import MatterSim + +# symlink connectivity dir of matterport3d sim to the root dir**** -> required for DummyAgent() and approximate_fgr2r() +ALL_SCAN_PATH = './data/binaural_rirs/mp3d' +FGR2R_DIR = './data/Fine-Grained-R2R' +VLNCE_FILE_DIR = './data/R2R_VLNCE_v1-2' +SEMANTIC_AUDIO_EPISODE_DIR = './data/datasets/semantic_audionav/mp3d/v1' +SPLIT = 
'train' + +GRAPH_DIR_PATH = './data/metadata/mp3d' +VIEW2NODE_PATH = './data/view2node.json' + +DIALOG_APPROX_DATASET_PATH = './data1/datasets/semantic_audionav_dialog_approx/mp3d/v1' + +r_mat = np.array([[1, 0, 0, 0], [0, 0, 1, 0], [0, -1, 0, 0], [0, 0, 0, 1]]) +rotation_base = np.array([0, 90, 180, 270]) + + +# this class is for adjusting heading +# since the heading is not defined for intermediate nodes but subinstruction starts from their, +# we need some information of the headings in the intermediate nodes +# so we will let a dummy agent traverse a path and the heading it takes sequentially +# for coming from previous node is considered as the approximate current heading + +class DummyAgent(): + def __init__(self, path, scanId, heading): + self.scanId = scanId + self.heading = heading + self.curr_path = None + self.curr_path_id = 0 + self.path = path + self.image_w = 640 + self.image_h = 480 + self.vfov = 60 + self.sim = MatterSim.Simulator() + self.sim.setRenderingEnabled(False) + self.sim.setDiscretizedViewingAngles(True) # Set increment/decrement to 30 degree. (otherwise by radians) + self.sim.setCameraResolution(self.image_w, self.image_h) + self.sim.setCameraVFOV(math.radians(self.vfov)) + self.all_headings = [self.heading] + + def newEpisodes(self, scanId, viewpointId, heading): + self.sim.newEpisode(scanId, viewpointId, heading, 0) + self.curr_path = viewpointId + + def getHeadings(self): + # self.newEpisodes(self.scanId, self.path[0], self.heading) + + for path_idx, node in enumerate(self.path[:-1]): + # flag = False + # calculate the view which result in minimum distance + dist = np.full((36,), np.inf) + info = {} + + # need to update self.heading + for ix in range(36): + if ix == 0: + self.sim.newEpisode(self.scanId, node, self.heading, math.radians(-30)) + elif ix % 12 == 0: + self.sim.makeAction(0, 1.0, 1.0) + else: + self.sim.makeAction(0, 1.0, 0) + + state = self.sim.getState() + # print([state.heading, state.elevation]) + if len(state.navigableLocations) > 1: + # print(self.path[path_idx+1]) + if state.navigableLocations[1].viewpointId == self.path[path_idx + 1]: + dist[ix] = np.sqrt(state.navigableLocations[1].rel_heading ** 2 + state.navigableLocations[ + 1].rel_elevation ** 2) + info[ix] = [state.heading, state.elevation] + + if np.amin(dist) != np.inf: + # then it will do the heading update + # else it will use the previous heading + # assert np.amin(dist)!= np.inf, 'no value found' + idx2take = np.argmin(dist) + self.heading = info[idx2take][0] + self.sim.newEpisode(self.scanId, node, self.heading, info[idx2take][0]) + self.sim.makeAction(1, 0, 0) + state = self.sim.getState() + self.heading = state.heading # this should match the previous one, just in case + + self.all_headings.append(self.heading) + + return self.all_headings + + +def get_scans(path=None): + semantic_split_path = os.path.join(SEMANTIC_AUDIO_EPISODE_DIR, SPLIT, 'content') + scans = [] + if path is None: + for elem in os.listdir(semantic_split_path): + scans.append(elem.split('.')[0]) + else: + for elem in os.listdir(path): + scans.append(elem.split('.')[0]) + + return scans + +def cart2sph(a): + x = a[0] + y = a[1] + z = a[2] + hxy = np.hypot(x, y) + r = np.hypot(hxy, z) + el = np.arctan2(z, hxy) + az = np.arctan2(y, x) + return az, el, r + + +def get_relevant_vlnce_episodes_full_instr(): + + # for val and test case, soundspace does not have episodes based on specific scans + # so for val and test case we will collect all the vlnce episodes from + if SPLIT=='train': + scans = get_scans() + else: 
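+        # For val/test, fall back to the scans that have binaural RIRs on disk
+        # (ALL_SCAN_PATH); those splits have no per-scan episode files under
+        # SEMANTIC_AUDIO_EPISODE_DIR.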
+ scans = get_scans(path=ALL_SCAN_PATH) + + print('getting scans from {} split for storing vlnce full instr'.format(SPLIT)) + + relevant_vln_episodes = {k: [] for k in scans} + count = {} + + for split in os.listdir(VLNCE_FILE_DIR): + count[split] = 0 + vlnce_split_path = os.path.join(VLNCE_FILE_DIR, split, '{}.json.gz'.format(split)) + + with gzip.open(vlnce_split_path) as f: + json_bytes = f.read() + json_str = json_bytes.decode('utf-8') + data = json.loads(json_str) + + for idx, elem in enumerate(data['episodes']): + scene_name = elem['scene_id'].split('/')[-1].split('.')[0] + if scene_name in scans: + relevant_vln_episodes[scene_name].append(elem) + count[split] += 1 + return relevant_vln_episodes, count + + +def get_relevant_fgr2r_episodes_full_instr(): + # read the episodes from fgr2r episode + # directly create sub_instruction episode + # keep the full instruction for matching purpose + + if SPLIT=='train': + scans = get_scans() + else: + scans = get_scans(path=ALL_SCAN_PATH) + + print('getting fgr2r full instr') + + relevant_fgr2r_episodes = {k: [] for k in scans} + count = {} + + adjusted_datadir_path = os.path.join(FGR2R_DIR, 'data_adjusted') + + splits = ['train', 'val_seen', 'val_unseen'] + + for split in splits: + count[split] = 0 + with open(os.path.join(adjusted_datadir_path, 'FGR2R_{}_adjusted.json'.format(split))) as f: + all_instr_epi = json.load(f) # list + + # append episodes of sub instructions + for item in all_instr_epi: + if item['scan'] in scans: + relevant_fgr2r_episodes[item['scan']].append(item) + count[split] += 1 + + return relevant_fgr2r_episodes, count + + +def updating_fgr2r(fgr2r_epi, vlnce_epi): + # check if instruction is present in vlnce + # if present: create subinstruction based episode, + # assign proper path (from vlnce) + # assign rotation (directly or converting approximate heading) + + if SPLIT=='train': + scans = get_scans() + else: + scans = get_scans(path=ALL_SCAN_PATH) + + updated_fgr2r_epi = {k: [] for k in scans} + + for scan, episodes in fgr2r_epi.items(): + for elem_fgr2r in episodes: + assert elem_fgr2r['scan'] == scan, 'scan is not matching' + + new_instrs = eval(elem_fgr2r['new_instructions']) + for instr_idx in range(len(new_instrs)): + + # check if the instruction is available in the vlnce case + for elem_vlnce in vlnce_epi[scan]: + # prune the episodes that are not available in vlnce + if elem_fgr2r['instructions'][instr_idx] == elem_vlnce['instruction']['instruction_text']: + for sub_instr_idx, sub_instr in enumerate(new_instrs[instr_idx]): + end_points = elem_fgr2r["chunk_view"][instr_idx][sub_instr_idx] + # if this is not a stop instruction + if end_points[0] != end_points[1]: + new_item = {} + new_item['sub_instr'] = (' ').join(sub_instr) + new_item['path'] = elem_vlnce['reference_path'][(end_points[0] - 1):end_points[1]] + if end_points[0] == 1: + # convention is to use [x,y,z,w] + # elem_vlnce provides rotation as a list : [x,y,z,w] + # it matches how the rotations in episodes of soundspaces are defined + new_item['rotation'] = elem_vlnce['start_rotation'] + else: + # matching with the convention + # not using quaternion(w,x,y,z) + + heading = elem_fgr2r['all_headings'][end_points[0] - 1] + heading = heading if heading <= 3.1416 else heading - 3.1416 * 2 + # make sure: + # habitat_sim.utils.quat_from_angle_axis for probably v0.1.5 or less + # habitat_sim.utils.common.quat_from_angle_axis for >= v0.1.6 + new_item['rotation'] = habitat_sim.utils.quat_from_angle_axis(heading, + np.array([0, -1, 0])) + new_item['rotation'] = 
quaternion.as_float_array(new_item['rotation']).tolist() + new_item['rotation'] = new_item['rotation'][1:] + [new_item['rotation'][0]] + + new_item['scan'] = scan + # keeping view points for easy mapping from + new_item['view_points'] = elem_fgr2r['path'][(end_points[0] - 1):end_points[1]] + updated_fgr2r_epi[scan].append(new_item) + break + # results in: + # updated_fgr2r_epi = {'different_scan': [{'sub_instr': , 'path': , 'rotation': , 'scan': }]} + return updated_fgr2r_epi + + +# adjust heading and save as new file +def adjust_heading(): + # first read the json files + # test set does not have path, so ignore it + original_datadir_path = os.path.join(FGR2R_DIR, 'data') + adjusted_datadir_path = os.path.join(FGR2R_DIR, 'data_adjusted') + splits = ['train', 'val_seen', 'val_unseen'] + + for split in splits: + print('adjusting heading for {}'.format(split)) + new_all_instr = [] + with open(os.path.join(original_datadir_path, 'FGR2R_{}.json'.format(split))) as f: + all_instr_epi = json.load(f) # list + + for item in all_instr_epi: + # connectivity_path = os.path.join(CONNECTIVITY_FILE_DIR, item['scan'][2:]) + d_agent = DummyAgent(item['path'], item['scan'], item['heading']) + all_headings = d_agent.getHeadings() + new_item = dict(item) + new_item['all_headings'] = all_headings + new_all_instr.append(new_item) + + with open(os.path.join(adjusted_datadir_path, 'FGR2R_{}_adjusted.json'.format(split)), 'w') as f: + json.dump(new_all_instr, f) + + +def load_nav_graphs_vln(scans): + ''' Load connectivity graph for each scan ''' + + def distance(pose1, pose2): + ''' Euclidean distance between two graph poses ''' + return ((pose1['pose'][3] - pose2['pose'][3]) ** 2 \ + + (pose1['pose'][7] - pose2['pose'][7]) ** 2 \ + + (pose1['pose'][11] - pose2['pose'][11]) ** 2) ** 0.5 + + graphs = {} + for scan in scans: + with open('connectivity/%s_connectivity.json' % scan) as f: + G = nx.Graph() + positions = {} + data = json.load(f) + for i, item in enumerate(data): + if item['included']: + for j, conn in enumerate(item['unobstructed']): + if conn and data[j]['included']: + positions[item['image_id']] = np.array([item['pose'][3], + item['pose'][7], item['pose'][11]]); + assert data[j]['unobstructed'][i], 'Graph should be undirected' + G.add_edge(item['image_id'], data[j]['image_id'], weight=distance(item, data[j])) + nx.set_node_attributes(G, values=positions, name='position') + graphs[scan] = G + return graphs + + +def generate_view2node(): + # should generate for all scans + scans = os.listdir(GRAPH_DIR_PATH) + view2node = {} + # load the graphs of vln + vln_graphs = load_nav_graphs_vln(scans) + cnt = 0 + for scan in scans: + view2node[scan] = {} + # load soundspace graph of the scan + scan_path = os.path.join(GRAPH_DIR_PATH, scan) + _, sound_G = load_metadata(scan_path) + + # load vln graph of the scan + vln_G = vln_graphs[scan] + + # collect the location information from all the views in the vln_graph + with open('./connectivity/{}_connectivity.json'.format(scan), 'r') as f: + scan_connectivity = json.load(f) # list + + # get corresponding positions of the viewid + view_location = {} + for node_vln in vln_G.nodes(): + for view in scan_connectivity: + if node_vln == view['image_id']: + pose = np.array(view['pose']).reshape(4, 4) + pose = np.matmul(r_mat, pose) + view_location[node_vln] = np.array([pose[0, 3], pose[1, 3], pose[2, 3]]) + break + + # using vln graph so that i don't need to compute mapping for same location multiple time + for node_vln, location in view_location.items(): + dist_all = [] + 
node_name_sound = [] + for node_sound in sound_G.nodes(): + location_sound = np.array(sound_G.nodes[node_sound]['point']) + # print(location_sound) + if location[1] >= location_sound[1] and location[1] < location_sound[1] + 2.99: + dist = np.linalg.norm( + np.array([location[0], location[2]]) - np.array([location_sound[0], location_sound[2]])) + dist_all.append(dist) + node_name_sound.append(node_sound) + + view2node[scan][node_vln] = {} + if not len(node_name_sound) == 0: + dist_all = np.array(dist_all) + view2node[scan][node_vln]['node_name'] = node_name_sound[np.argmin(dist_all)] + view2node[scan][node_vln]['position'] = sound_G.nodes[node_name_sound[np.argmin(dist_all)]]['point'] + else: + cnt += 1 + view2node[scan][node_vln]['node_name'] = None + view2node[scan][node_vln]['position'] = None + + # print('assigned none', cnt) + with open(VIEW2NODE_PATH, 'w') as f: + json.dump(view2node, f) + + return view2node + + +def check_view2node(view2node): + # check in how many cases two view id map to same node + cnt = 0 + total_node = 0 + for scan in view2node.keys(): + all_sound_node = set() + for node_vln in view2node[scan].keys(): + all_sound_node.add(view2node[scan][node_vln]['node_name']) + cnt += (len(view2node[scan]) - len(all_sound_node)) + total_node += len(view2node[scan]) + + # total node 10191 + # 3306 difference in two set + # too much + print(cnt, total_node) + + +def approximate_fgr2r_in_soundspace(fgr2r_epi, view2node): + if SPLIT=='train': + scans = get_scans() + else: + scans = get_scans(path=ALL_SCAN_PATH) + + updated_fgr2r_epi = {} + + # also check after pruning how many episodes left + fgr2r_cnt = 0 + updated_fgr2r_cnt = 0 + + for scan in scans: + fgr2r_cnt += len(fgr2r_epi[scan]) + updated_fgr2r_epi[scan] = [] + + for episode in fgr2r_epi[scan]: + new_epi = dict(episode) + new_epi['path_node'] = [] + new_epi['path_position'] = [] + del new_epi['path'] + del new_epi['view_points'] + last_node = None + for idx, viewpoint in enumerate(episode['view_points']): + if idx > 0: + if last_node != view2node[scan][viewpoint]['node_name'] and view2node[scan][viewpoint][ + 'node_name'] != None: + new_epi['path_node'].append(view2node[scan][viewpoint]['node_name']) + new_epi['path_position'].append(view2node[scan][viewpoint]['position']) + last_node = view2node[scan][viewpoint]['node_name'] + else: + new_epi['path_node'].append(view2node[scan][viewpoint]['node_name']) + new_epi['path_position'].append(view2node[scan][viewpoint]['position']) + last_node = view2node[scan][viewpoint]['node_name'] + + if len(new_epi['path_node']) >= 2 and None not in new_epi['path_node']: + updated_fgr2r_epi[scan].append(new_epi) + + updated_fgr2r_cnt += len(updated_fgr2r_epi[scan]) + + print('fgr2r_cnt:', fgr2r_cnt, 'updated_fgr2r_cnt:', updated_fgr2r_cnt) + return updated_fgr2r_epi + + +def position_encoding(position): + return '{:.2f}_{:.2f}_{:.2f}'.format(*position) + +''' +def create_episodes(fgr2r_episodes, save=True): + # for each fgr2r create 5 samples with goal in the trajectory direction + # if the start to main goal shortest path contains language sub goal then it is included + # check if it let us have 5 samples for each fgr2r episode + + semantic_split_path = os.path.join(SEMANTIC_AUDIO_EPISODE_DIR, SPLIT, 'content') + + if SPLIT=='train': + scans = get_scans() + else: + scans = get_scans(path=ALL_SCAN_PATH) + # since the split name/ scan name does not match + # collecting all the savi episodes beforehand + scans_split = get_scans() + audionav_episodes = [] + for sound_scan in scans_split: + 
episode_file = os.path.join(semantic_split_path, '{}.json.gz'.format(sound_scan)) + with gzip.open(episode_file) as f: + json_bytes = f.read() + json_str = json_bytes.decode('utf-8') + audionav_episodes += json.loads(json_str)['episodes'] + + + total_fgr2r_cnt = 0 + total_gen_soundspace_cnt = 0 + all_episodes = {} + + + for scan in scans: + cnt = 0 + all_episodes[scan] = [] + # load soundspace graph of the scan + scan_path = os.path.join(GRAPH_DIR_PATH, scan) + _, sound_G = load_metadata(scan_path) + + position_to_index_mapping = dict() + for node in sound_G.nodes(): + position_to_index_mapping[position_encoding(sound_G.nodes()[node]['point'])] = node + + # create shortest path for all location + shortest_paths = dict(nx.all_pairs_dijkstra_path(sound_G)) + + # read the soundspace episode for current scan + if SPLIT=='train': + episode_file = os.path.join(semantic_split_path, '{}.json.gz'.format(scan)) + with gzip.open(episode_file) as f: + json_bytes = f.read() + json_str = json_bytes.decode('utf-8') + audionav_episodes = json.loads(json_str)['episodes'] # list + + # for each fgr2r episode check which soundspace/semantic audionav episode matches + # select randomly min(match,5) among them + # assign 3 or 4 actions (for particular rotation ofcourse) + + for fgr2r_epi in fgr2r_episodes[scan]: + # define a list of possible episode + possible_episodes = [] + for audionav_epi in audionav_episodes: + if audionav_epi['scene_id'].split('/')[0] == scan: + s_node = fgr2r_epi['path_node'][0] # starting node for dialog + d_e_node = fgr2r_epi['path_node'][-1] # ending node for dialog + final_e_node = position_to_index_mapping[ + position_encoding(audionav_epi['goals'][0]['position'])] # goal node for sound + if final_e_node in shortest_paths[s_node].keys(): + s_path = shortest_paths[s_node][final_e_node] + if (d_e_node in s_path) and len(s_path)>=7: + # form the episode, what to keep? 
+ curr_episode = dict(audionav_epi) + curr_episode['dialog_node'] = fgr2r_epi['path_node'] + curr_episode['dialog_point'] = fgr2r_epi['path_position'] + curr_episode['sub_instr'] = fgr2r_epi['sub_instr'] + curr_episode['dialog_rotation'] = fgr2r_epi['rotation'] + # need to work with episode id too (maybe after creating all the episodes, assign id sequentially) + + possible_episodes.append(curr_episode) + + if len(possible_episodes) > 5: + possible_episodes = random.sample(possible_episodes, 5) + cnt += 1 + + all_episodes[scan] += possible_episodes + + total_fgr2r_cnt += len(fgr2r_episodes[scan]) + total_gen_soundspace_cnt += len(all_episodes[scan]) + print('number of approximated fgr2r_episodes for scan {}: {}'.format(scan, len(fgr2r_episodes[scan]))) + print( + 'number of generated soundspace episodes from fgr2r for scan {}: {}'.format(scan, len(all_episodes[scan]))) + print('number of time more than 5 possible episodes can be created for scan {}: {}'.format(scan, cnt)) + + if save: + # to save file + if not os.path.exists(os.path.join(DIALOG_APPROX_DATASET_PATH, SPLIT, 'content')): + os.makedirs(os.path.join(DIALOG_APPROX_DATASET_PATH, SPLIT, 'content')) + + corresponding_dict = {'episodes': all_episodes[scan], 'scan': scan} + json_str = json.dumps(corresponding_dict) + json_bytes = json_str.encode('utf-8') + with gzip.open(os.path.join(DIALOG_APPROX_DATASET_PATH, SPLIT, 'content', '{}.json.gz'.format(scan)), + 'w') as f: + f.write(json_bytes) + + print('total_fgr2r_cnt, total_gen_soundspace_cnt', total_fgr2r_cnt, total_gen_soundspace_cnt) +''' + +def create_episodes_dialog_start(fgr2r_episodes, save=True): + # for each fgr2r create 5 samples with goal in the trajectory direction + # difference with create_episodes: starting position and location is the dialog position and location + # if the start to main goal shortest path contains language sub goal then it is included + # check if it let us have 5 samples for each fgr2r episode + + semantic_split_path = os.path.join(SEMANTIC_AUDIO_EPISODE_DIR, SPLIT, 'content') + + if SPLIT=='train': + scans = get_scans() + else: + scans = get_scans(path=ALL_SCAN_PATH) + # since the split name/ scan name does not match + # collecting all the savi episodes beforehand + scans_split = get_scans() + audionav_episodes = [] + for sound_scan in scans_split: + episode_file = os.path.join(semantic_split_path, '{}.json.gz'.format(sound_scan)) + with gzip.open(episode_file) as f: + json_bytes = f.read() + json_str = json_bytes.decode('utf-8') + audionav_episodes += json.loads(json_str)['episodes'] + + total_fgr2r_cnt = 0 + total_gen_soundspace_cnt = 0 + all_episodes = {} + for scan in scans: + cnt = 0 + episode_id = 0 + all_episodes[scan] = [] + # load soundspace graph of the scan + scan_path = os.path.join(GRAPH_DIR_PATH, scan) + _, sound_G = load_metadata(scan_path) + + position_to_index_mapping = dict() + for node in sound_G.nodes(): + position_to_index_mapping[position_encoding(sound_G.nodes()[node]['point'])] = node + + # create shortest path for all location + shortest_paths = dict(nx.all_pairs_dijkstra_path(sound_G)) + + # print(shortest_paths.keys()) + + # read the soundspace episode for current scan + if SPLIT=='train': + episode_file = os.path.join(semantic_split_path, '{}.json.gz'.format(scan)) + with gzip.open(episode_file) as f: + json_bytes = f.read() + json_str = json_bytes.decode('utf-8') + audionav_episodes = json.loads(json_str)['episodes'] # list + + # for each fgr2r episode check which soundspace/semantic audionav episode matches + # select 
randomly min(match,5) among them + # assign 3 or 4 actions (for particular rotation ofcourse) + + for fgr2r_epi in fgr2r_episodes[scan]: + # define a list of possible episode + possible_episodes = [] + for audionav_epi in audionav_episodes: + if audionav_epi['scene_id'].split('/')[0] == scan: + s_node = fgr2r_epi['path_node'][0] # starting node for dialog + d_e_node = fgr2r_epi['path_node'][-1] # ending node for dialog + + # calculate directional info + # angles: alpha: X-Z, beta: + s_node_pos = sound_G.nodes[s_node]['point'] + s_node_pos = np.array( [s_node_pos[0], -s_node_pos[2], s_node_pos[1]]) + d_e_node_pos = sound_G.nodes[d_e_node]['point'] + d_e_node_pos = np.array( [d_e_node_pos[0], -d_e_node_pos[2], d_e_node_pos[1]]) + az, el, _ = cart2sph(d_e_node_pos - s_node_pos) # el is elevation: arctan2(z/hxy) not conventional + + final_e_node = position_to_index_mapping[ + position_encoding(audionav_epi['goals'][0]['position'])] # goal node for sound + if final_e_node in shortest_paths[s_node].keys(): + s_path = shortest_paths[s_node][final_e_node] + if (d_e_node in s_path) and len(s_path)>=7: + # form the episode, what to keep? + curr_episode = dict(audionav_epi) + curr_episode['dialog_node'] = fgr2r_epi['path_node'] + curr_episode['start_position'] = fgr2r_epi['path_position'][0] + curr_episode['sub_instr'] = fgr2r_epi['sub_instr'] + curr_episode['direction'] = [az, el] + + # rotation needs to be processed + rotation_angle = int(np.around(np.rad2deg(quat_to_angle_axis(quat_from_coeffs(fgr2r_epi['rotation']))[0]))) % 360 + updated_rotation_angle = int(rotation_base[np.argmin(abs(rotation_angle-rotation_base))]) + rotation_quat = quat_from_angle_axis(np.deg2rad(updated_rotation_angle), np.array([0, 1, 0])) + rotation_quat_list = quaternion.as_float_array(rotation_quat).tolist() + curr_episode['start_rotation'] = rotation_quat_list + curr_episode['rotation_angle'] = updated_rotation_angle + + # need to work with episode_id too (maybe after creating all the episodes, assign id sequentially) + curr_episode['episode_id'] = episode_id + episode_id += 1 + possible_episodes.append(curr_episode) + + if len(possible_episodes) > 5: + possible_episodes = random.sample(possible_episodes, 5) + cnt += 1 + + all_episodes[scan] += possible_episodes + + total_fgr2r_cnt += len(fgr2r_episodes[scan]) + total_gen_soundspace_cnt += len(all_episodes[scan]) + print('number of approximated fgr2r_episodes for scan {}: {}'.format(scan, len(fgr2r_episodes[scan]))) + print( + 'number of generated soundspace episodes from fgr2r for scan {}: {}'.format(scan, len(all_episodes[scan]))) + print('number of time more than 5 possible episodes can be created for scan {}: {}'.format(scan, cnt)) + + if save and len(all_episodes[scan])>0: + # to save file + if not os.path.exists(os.path.join(DIALOG_APPROX_DATASET_PATH, SPLIT, 'content')): + os.makedirs(os.path.join(DIALOG_APPROX_DATASET_PATH, SPLIT, 'content')) + + corresponding_dict = {'episodes': all_episodes[scan], 'scan': scan} + json_str = json.dumps(corresponding_dict) + json_bytes = json_str.encode('utf-8') + with gzip.open(os.path.join(DIALOG_APPROX_DATASET_PATH, SPLIT, 'content', '{}.json.gz'.format(scan)), + 'w') as f: + f.write(json_bytes) + + print('total_fgr2r_cnt, total_gen_soundspace_cnt', total_fgr2r_cnt, total_gen_soundspace_cnt) + + +def check_episodes(): + file_path = './data/datasets/semantic_audionav_dialog_approx/mp3d/v1/train/content/VzqfbhrpDEA.json.gz' + with gzip.open(file_path) as f: + json_bytes = f.read() + + json_str = json_bytes.decode('utf-8') + 
data = json.loads(json_str) + print(data.keys()) + print(data['episodes'][0]) + + +# run: python scripts/generate_vln_episode.py +if __name__ == '__main__': + + adjusted_datadir_path = os.path.join(FGR2R_DIR, 'data_adjusted') + if not os.path.isdir(adjusted_datadir_path): + os.makedirs(adjusted_datadir_path) + + # if the FGR2R dataset with adjusted heading is not available then generate files + if not os.path.isdir(os.path.join(FGR2R_DIR, 'data_adjusted')): + adjust_heading() + + # now based on the scans of soundspace, gather episodes from vlnce + relevant_vlnce_episodes, count_vlnce = get_relevant_vlnce_episodes_full_instr() + + # gather all relevant episodes from FGR2R + # not all instructions are accessible in continuous case + # so need to prune FGR2R episodes based on vlnce + + # first gather all the instructions with adjusted heading of fgr2r + relevant_fgr2r_episodes, count_fgr2r = get_relevant_fgr2r_episodes_full_instr() + + # now create_subinstruction, prune based on vlnce and update the rotation information + updated_fgr2r_epi = updating_fgr2r(relevant_fgr2r_episodes, relevant_vlnce_episodes) + + # create view2node that maps vlnce viewid to nodes/location in soundspace + if not os.path.isfile(VIEW2NODE_PATH): + view2node = generate_view2node() + else: + with open(VIEW2NODE_PATH, 'r') as f: + view2node = json.load(f) + + # now convert views of each vlnce episode in soundspace grid + # episodes that contains None values in mapping should be ignored + # found 470 nodes in all the scans that results in None + + appr_fgr2r_epi = approximate_fgr2r_in_soundspace(updated_fgr2r_epi, view2node) + + # create_episodes(appr_fgr2r_epi) # ****** i5noydFURQK 2n8kARJN3HM HxpKQynjfin had zero episodes + + create_episodes_dialog_start(appr_fgr2r_epi) # ****** i5noydFURQK 2n8kARJN3HM HxpKQynjfin had zero episodes + + # check_episodes() diff --git a/scripts/interactive_demo.py b/scripts/interactive_demo.py new file mode 100644 index 0000000..ac85130 --- /dev/null +++ b/scripts/interactive_demo.py @@ -0,0 +1,246 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import sys +import logging +import argparse + +import numpy as np +import pygame + +from habitat.sims.habitat_simulator.actions import HabitatSimActions +from habitat.utils.visualizations import maps +from habitat.datasets import make_dataset +from habitat.utils.visualizations.utils import observations_to_image +import soundspaces +from ss_baselines.common.environments import AudioNavRLEnv +from ss_baselines.common.utils import images_to_video_with_audio +from ss_baselines.av_nav.config import get_config + + +def draw_top_down_map(info): + top_down_map = info["top_down_map"]["map"] + + top_down_map = maps.colorize_topdown_map(top_down_map) + map_agent_pos = info["top_down_map"]["agent_map_coord"] + top_down_map = maps.draw_agent( + image=top_down_map, + agent_center_coord=map_agent_pos, + agent_rotation=info["top_down_map"]["agent_angle"], + agent_radius_px=top_down_map.shape[0] // 25, + ) + + return top_down_map + + +def interactive_demo(config, env): + # Set the width and height of the screen [width, height] + pygame.init() + size = (728, 256) + screen = pygame.display.set_mode(size) + pygame.display.set_caption("Interactive Demo") + # Loop until the user clicks the close button. 
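+    # Keyboard controls (read in wait() below): w = MOVE_FORWARD,
+    # a = TURN_LEFT, d = TURN_RIGHT, f = STOP. The pressed keys are printed on
+    # exit and can be replayed non-interactively via --keys.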
+ done = False + # Used to manage how fast the screen updates + clock = pygame.time.Clock() + + frames = list() + audios = list() + observation = env.reset() + rgb_image = np.swapaxes(observation['rgb'], 0, 1) + # screen.blit(pygame.surfarray.make_surface(rgb_image), (0, 0)) + pygame.display.flip() + # -------- Main Program Loop ----------- + keys = [] + while not done: + # --- Main event loop + def wait(): + while True: + for event in pygame.event.get(): + if event.type == pygame.QUIT: + pygame.quit() + sys.exit() + if event.type == pygame.KEYDOWN: + action = None + if event.key == pygame.K_w: # w + action = HabitatSimActions.MOVE_FORWARD + keys.append('w') + elif event.key == pygame.K_a: # a + action = HabitatSimActions.TURN_LEFT + keys.append('a') + elif event.key == pygame.K_d: # d + action = HabitatSimActions.TURN_RIGHT + keys.append('d') + elif event.key == pygame.K_f: # f + action = HabitatSimActions.STOP + keys.append('f') + if action is not None: + return action + + action = wait() + # --- Game logic should go here + observation, reward, done, info = env.step(**{'action': action}) + if env.get_done(None): + # observation = env.reset() + break + + if config.TASK_CONFIG.SIMULATOR.CONTINUOUS_VIEW_CHANGE and 'intermediate' in observation: + for obs in observation['intermediate']: + frame = observations_to_image(obs, info) + frames.append(frame) + frame = observations_to_image(observation, info) + frames.append(frame) + frame = np.swapaxes(frame, 0, 1) + audio = observation['audiogoal'] + audios.append(audio) + + # Here, we clear the screen to white. Don't put other drawing commands + # above this, or they will be erased with this command. + screen.fill((255, 255, 255)) + screen.blit(pygame.surfarray.make_surface(frame), (0, 0)) + # smaller_frame = block_reduce(frame, block_size=(down_sampling, down_sampling, 1), func=np.mean) + # screen.blit(pygame.surfarray.make_surface(smaller_frame), (0, 0)) + + # play sound + # temp_file = 'data/temp/temp.wav' + # sr = config.TASK_CONFIG.SIMULATOR.AUDIO.RIR_SAMPLING_RATE + # audio = np.int16(audio * 32767).T + # wavfile.write(temp_file, sr, audio) + # pygame.mixer.music.load(temp_file) + # pygame.mixer.music.play(-1) + + # --- Go ahead and update the screen with what we've drawn. + pygame.display.flip() + + # --- Limit to 60 frames per second + clock.tick(1) + + # Close the window and quit. 
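+    # (The frames and audio accumulated above are stitched into a video with
+    # sound below via images_to_video_with_audio.)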
+ pygame.quit() + env.close() + print('Keys: {}'.format(','.join(keys))) + + # write frames and audio into videos + video_dir = 'data/visualizations/demo' + video_name = 'demo' + fps = config.TASK_CONFIG.SIMULATOR.VIEW_CHANGE_FPS \ + if config.TASK_CONFIG.SIMULATOR.CONTINUOUS_VIEW_CHANGE else 1 + images_to_video_with_audio(frames, video_dir, video_name, audios, + sr=config.TASK_CONFIG.SIMULATOR.AUDIO.RIR_SAMPLING_RATE, fps=fps) + + +def following(config, env, keys): + observation = env.reset() + frames = list() + audios = list() + for key in keys: + if key == 'w': # w + action = HabitatSimActions.MOVE_FORWARD + elif key == 'a': # a + action = HabitatSimActions.TURN_LEFT + elif key == 'd': # d + action = HabitatSimActions.TURN_RIGHT + elif key == 'f': # f + action = HabitatSimActions.STOP + + # --- Game logic should go here + observation, reward, done, info = env.step(**{'action': action}) + if env.get_done(None): + break + + if config.TASK_CONFIG.SIMULATOR.CONTINUOUS_VIEW_CHANGE and 'intermediate' in observation: + for obs in observation['intermediate']: + frame = observations_to_image(obs, info) + frames.append(frame) + frame = observations_to_image(observation, info) + frames.append(frame) + audio = observation['audiogoal'] + audios.append(audio) + + env.close() + + # write frames and audio into videos + video_dir = 'data/visualizations/demo' + video_name = 'demo' + fps = config.TASK_CONFIG.SIMULATOR.VIEW_CHANGE_FPS \ + if config.TASK_CONFIG.SIMULATOR.CONTINUOUS_VIEW_CHANGE else 1 + images_to_video_with_audio(frames, video_dir, video_name, audios, + sr=config.TASK_CONFIG.SIMULATOR.AUDIO.RIR_SAMPLING_RATE, fps=fps) + + +def main(): + # import os + # os.environ["SDL_VIDEODRIVER"] = "dummy" + parser = argparse.ArgumentParser() + # parser.add_argument('--sound', default=False, action='store_true') + parser.add_argument( + "--run-type", + choices=["train", "eval"], + default='eval', + help="run type of the experiment (train or eval)", + ) + parser.add_argument( + "--exp-config", + type=str, + required=False, + default='ss_baselines/av_nav/config/audionav/mp3d/interactive_demo.yaml', + help="path to config yaml containing info about experiment", + ) + parser.add_argument( + "opts", + default=None, + nargs=argparse.REMAINDER, + help="Modify config options from command line", + ) + parser.add_argument( + "--debug", + default=False, + action='store_true', + help="Modify config options from command line", + ) + parser.add_argument( + "--keys", + default='', + type=str, + help="Modify config options from command line", + ) + args = parser.parse_args() + + # file_handler = logging.FileHandler(log_file, mode=mode) + stdout_handler = logging.StreamHandler(sys.stdout) + level = logging.INFO if not args.debug else logging.DEBUG + logging.basicConfig(level=level, handlers=[stdout_handler], + format='%(asctime)s, %(levelname)s: %(message)s', datefmt="%Y-%m-%d %H:%M:%S") + + config = get_config( + config_paths=args.exp_config, + opts=args.opts, + run_type=args.run_type) + config.defrost() + config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") + config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT + if args.keys == '': + config.TASK_CONFIG.SIMULATOR.RGB_SENSOR.WIDTH = config.TASK_CONFIG.SIMULATOR.RGB_SENSOR.HEIGHT = \ + config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.WIDTH = config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.HEIGHT = 256 + config.TASK_CONFIG.SIMULATOR.CONTINUOUS_VIEW_CHANGE = False + else: + config.TASK_CONFIG.TASK.TOP_DOWN_MAP.DRAW_GOAL_POSITIONS = False + config.freeze() + print(config) + dataset 
+    dataset = make_dataset(id_dataset=config.TASK_CONFIG.DATASET.TYPE, config=config.TASK_CONFIG.DATASET)
+    env = AudioNavRLEnv(config=config, dataset=dataset)
+
+    if args.keys == '':
+        interactive_demo(config, env)
+    else:
+        keys = args.keys.split(',')
+        following(config, env, keys)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/unzipping_skybox_image.py b/scripts/unzipping_skybox_image.py
new file mode 100644
index 0000000..8b37e52
--- /dev/null
+++ b/scripts/unzipping_skybox_image.py
@@ -0,0 +1,21 @@
+# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL)
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+
+import zipfile
+import os
+
+
+base_path = '/home/sudipta/drive/audiogoal/v1/scans/'
+base_extract_path = '/home/sudipta/drive/audiogoal/v1/scans/'
+
+scans = os.listdir(os.path.join(base_path))
+
+for idx, scan in enumerate(scans):
+    print('working on {}/{}'.format(idx + 1, len(scans)))
+    path_to_zip_file = os.path.join(base_path, scan, 'matterport_skybox_images.zip')
+    with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
+        zip_ref.extractall(base_extract_path)
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..7de6439
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,58 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# SPDX-License-Identifier: CC-BY-4.0
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from setuptools import setup
+
+
+setup(
+    name='sound-spaces',
+    version='0.1.1',
+    packages=[
+        'soundspaces',
+        'ss_baselines'
+    ],
+    install_requires=[
+        'torch',
+        'gym',
+        'numpy>=1.16.1',
+        'yacs>=0.1.5',
+        'numpy-quaternion>=2019.3.18.14.33.20',
+        'attrs>=19.1.0',
+        'opencv-python>=3.3.0',
+        'imageio>=2.2.0',
+        'imageio-ffmpeg>=0.2.0',
+        'scipy>=1.0.0',
+        'tqdm>=4.0.0',
+        'numba',
+        'Pillow',
+        'pydub',
+        'getch',
+        'matplotlib',
+        'librosa',
+        'torchsummary',
+        'gitpython',
+        'networkx',
+        'notebook',
+        'moviepy',
+        'tensorflow',
+        'astropy',
+        'scikit-image',
+        'torchvision',
+        'torchtext',
+        'tensorboardX',
+        'nltk',
+        'pandas',
+        'pynvml'
+    ],
+    extras_require={
+        'test': [
+            'pylint',
+            'pytest',
+        ],
+    },
+)
diff --git a/soundspaces/README.md b/soundspaces/README.md
new file mode 100644
index 0000000..874c0cd
--- /dev/null
+++ b/soundspaces/README.md
@@ -0,0 +1,82 @@
+# SoundSpaces Dataset
+
+## Overview
+The SoundSpaces dataset includes audio renderings (room impulse responses) for two datasets, metadata of each scene, episode datasets and mono sound files.
+
+## Linking the Required Files
+0. Create a folder named "data" under the $ROOT directory.
+1. Use the command `ln -s /data2/datasets/tmp/supaul2/soundspace/to_share/data/* $ROOT/data/` to symlink all of the required datasets.
+2. Done. Check that it matches the data folder structure given below.
+3. Replace `./data/datasets/semantic_audionav_dialog_approx` with [semantic_audionav_dialog_approx](https://drive.google.com/drive/folders/1N4i-vj_ZsH9g8NDII6iUErie46YZZSWi?usp=sharing)
+
+
+## Download
+0. Create a folder named "data" under the $ROOT directory.
+1. Download [Matterport3D](https://niessner.github.io/Matterport), or clone it from `/data2/datasets/tmp/supaul2/mp3d_habitat/v1/tasks/mp3d/`.
+Keep it inside the `$ROOT/data/scene_datasets` folder (follow the data folder structure).
+2. Run the commands below in the **data** directory to download the partial binaural RIRs (867G), metadata (1M), datasets (77M) and sound files (13M).
Note that these partial binaural RIRs only contain renderings for nodes accessible by the agent on the navigation graph.
+The downloaded files are also available in `/data2/datasets/tmp/supaul2/soundspace/`.
+```
+wget http://dl.fbaipublicfiles.com/SoundSpaces/binaural_rirs.tar && tar xvf binaural_rirs.tar
+wget http://dl.fbaipublicfiles.com/SoundSpaces/metadata.tar.xz && tar xvf metadata.tar.xz
+wget http://dl.fbaipublicfiles.com/SoundSpaces/sounds.tar.xz && tar xvf sounds.tar.xz
+wget http://dl.fbaipublicfiles.com/SoundSpaces/datasets.tar.xz && tar xvf datasets.tar.xz
+wget http://dl.fbaipublicfiles.com/SoundSpaces/pretrained_weights.tar.xz && tar xvf pretrained_weights.tar.xz
+```
+3. Run the command below in the root directory to cache observations for the two datasets:
+```
+python scripts/cache_observations.py
+```
+4. Copy/symlink `node2view.json` from `/data2/datasets/tmp/supaul2/soundspace/`.
+5. Copy/symlink `view2node.json` from `/data2/datasets/tmp/supaul2/soundspace/`.
+6. (Optional) Download the full ambisonic (3.6T for Matterport) and binaural (682G for Matterport and 81G for Replica) RIR data by running the following script in the root directory. Remember to first back up the downloaded binaural RIR data.
+```
+python scripts/download_data.py --dataset mp3d --rir-type binaural_rirs
+python scripts/download_data.py --dataset replica --rir-type binaural_rirs
+```
+
+
+
+## Data Folder Structure
+```
+    .
+    ├── ...
+    ├── metadata                    # stores metadata of environments
+    │   └── [dataset]
+    │       └── [scene]
+    │           ├── point.txt       # coordinates of all points in mesh coordinates
+    │           ├── graph.pkl       # points are pruned to a connectivity graph
+    ├── binaural_rirs               # binaural RIRs of 2 channels
+    │   └── [dataset]
+    │       └── [scene]
+    │           └── [angle]         # azimuth angle of agent's heading in mesh coordinates
+    │               └── [receiver]-[source].wav
+    ├── datasets                    # stores datasets of episodes of different splits
+    │   └── [dataset]
+    │       └── [version]
+    │           └── [split]
+    │               ├── [split].json.gz
+    │               └── content
+    │                   └── [scene].json.gz
+    ├── sounds                      # stores all 102 copyright-free sounds
+    │   └── 1s_all
+    │       └── [sound].wav
+    ├── scene_datasets              # 3D scene assets (Matterport3D / Replica)
+    │   └── [dataset]
+    │       └── [scene]
+    │           └── [scene].house (habitat/mesh_semantic.glb)
+    ├── scene_observations          # pre-rendered scene observations
+    │   └── [dataset]
+    │       └── [scene].pkl         # dictionary is in the format of {(receiver, rotation): sim_obs}
+    ├── pretrained_weights          # weights provided by SoundSpaces (not giving proper results)
+    │   └── [dataset]
+    │       └── savi
+    │           └── [].pth
+    ├── node2view.json
+    └── view2node.json
+```
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# SPDX-License-Identifier: CC-BY-4.0
diff --git a/soundspaces/__init__.py b/soundspaces/__init__.py
new file mode 100644
index 0000000..c9872e7
--- /dev/null
+++ b/soundspaces/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# SPDX-License-Identifier: CC-BY-4.0
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
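+
+# Orientation note (not from the original file): importing this package has
+# the side effect of registering the simulator, datasets, tasks and sensors
+# imported below with habitat's registry, so configs can refer to them by
+# name. A minimal sketch, assuming the default class-name registration:
+#
+#     import soundspaces  # noqa: F401  # triggers the registrations
+#     from habitat.core.registry import registry
+#     sim_cls = registry.get_simulator("SoundSpacesSim")  # name assumed here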
+ +from soundspaces.tasks.action_space import SoundspacesDialogSimV0ActionSpaceConfiguration +from soundspaces.simulator import SoundSpacesSim +from soundspaces.datasets.audionav_dataset import AudioNavDataset +from soundspaces.datasets.semantic_audionav_dataset import SemanticAudioNavDataset +from soundspaces.tasks.audionav_task import AudioNavigationTask +from soundspaces.tasks.semantic_audionav_task import SemanticAudioNavigationTask +from soundspaces.tasks.nav import AudioGoalSensor +from soundspaces.tasks.nav import SpectrogramSensor +from soundspaces.tasks.nav import Collision +from soundspaces.challenge import Challenge +from soundspaces.benchmark import Benchmark diff --git a/soundspaces/benchmark.py b/soundspaces/benchmark.py new file mode 100644 index 0000000..fd0b334 --- /dev/null +++ b/soundspaces/benchmark.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +r"""Implements evaluation of ``habitat.Agent`` inside ``habitat.Env``. +``habitat.Benchmark`` creates a ``habitat.Env`` which is specified through +the ``config_env`` parameter in constructor. The evaluation is task agnostic +and is implemented through metrics defined for ``habitat.EmbodiedTask``. +""" + +import os +from collections import defaultdict +from typing import Dict, Optional + +from habitat.core.agent import Agent +from habitat.core.env import Env +from ss_baselines.av_nav.config import get_task_config + + +class Benchmark: + r"""Benchmark for evaluating agents in environments.""" + + def __init__( + self, config_paths: Optional[str] = None, eval_remote: bool = False + ) -> None: + r""".. + + :param config_paths: file to be used for creating the environment + :param eval_remote: boolean indicating whether evaluation should be run remotely or locally + """ + config_env = get_task_config(config_paths) + self._eval_remote = eval_remote + + if self._eval_remote is True: + self._env = None + else: + self._env = Env(config=config_env) + + def remote_evaluate( + self, agent: "Agent", num_episodes: Optional[int] = None + ): + # The modules imported below are specific to habitat-challenge remote evaluation. + # These modules are not part of the habitat-lab repository. 
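+        # Shape of the exchange, for orientation (inferred from the calls
+        # below, not an official contract): each stub call returns a Package
+        # whose SerializedEntity is a pickled dict, e.g.
+        #     res_env = unpack_for_grpc(stub.reset(Package()).SerializedEntity)
+        #     res_env["observations"]  # -> agent observations for this step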
+ import pickle + import time + + import evalai_environment_habitat # noqa: F401 + import evaluation_pb2 + import evaluation_pb2_grpc + import grpc + + time.sleep(60) + + def pack_for_grpc(entity): + return pickle.dumps(entity) + + def unpack_for_grpc(entity): + return pickle.loads(entity) + + def remote_ep_over(stub): + res_env = unpack_for_grpc( + stub.episode_over(evaluation_pb2.Package()).SerializedEntity + ) + return res_env["episode_over"] + + env_address_port = os.environ.get("EVALENV_ADDPORT", "localhost:8085") + channel = grpc.insecure_channel(env_address_port) + stub = evaluation_pb2_grpc.EnvironmentStub(channel) + + base_num_episodes = unpack_for_grpc( + stub.num_episodes(evaluation_pb2.Package()).SerializedEntity + ) + num_episodes = base_num_episodes["num_episodes"] + + agg_metrics: Dict = defaultdict(float) + + count_episodes = 0 + + while count_episodes < num_episodes: + agent.reset() + res_env = unpack_for_grpc( + stub.reset(evaluation_pb2.Package()).SerializedEntity + ) + + while not remote_ep_over(stub): + obs = res_env["observations"] + action = agent.act(obs) + + res_env = unpack_for_grpc( + stub.act_on_environment( + evaluation_pb2.Package( + SerializedEntity=pack_for_grpc(action) + ) + ).SerializedEntity + ) + + metrics = unpack_for_grpc( + stub.get_metrics( + evaluation_pb2.Package( + SerializedEntity=pack_for_grpc(action) + ) + ).SerializedEntity + ) + + for m, v in metrics["metrics"].items(): + agg_metrics[m] += v + count_episodes += 1 + + avg_metrics = {k: v / count_episodes for k, v in agg_metrics.items()} + + stub.evalai_update_submission(evaluation_pb2.Package()) + + return avg_metrics + + def local_evaluate( + self, agent: "Agent", num_episodes: Optional[int] = None + ) -> Dict[str, float]: + if num_episodes is None: + num_episodes = len(self._env.episodes) + else: + assert num_episodes <= len(self._env.episodes), ( + "num_episodes({}) is larger than number of episodes " + "in environment ({})".format( + num_episodes, len(self._env.episodes) + ) + ) + + assert num_episodes > 0, "num_episodes should be greater than 0" + + agg_metrics: Dict = defaultdict(float) + + count_episodes = 0 + while count_episodes < num_episodes: + agent.reset() + observations = self._env.reset() + + while not self._env.episode_over: + action = agent.act(observations) + observations = self._env.step(action) + + metrics = self._env.get_metrics() + for m, v in metrics.items(): + agg_metrics[m] += v + count_episodes += 1 + + avg_metrics = {k: v / count_episodes for k, v in agg_metrics.items()} + + return avg_metrics + + def evaluate( + self, agent: "Agent", num_episodes: Optional[int] = None + ) -> Dict[str, float]: + r""".. + + :param agent: agent to be evaluated in environment. + :param num_episodes: count of number of episodes for which the + evaluation should be run. + :return: dict containing metrics tracked by environment. + """ + + if self._eval_remote is True: + return self.remote_evaluate(agent, num_episodes) + else: + return self.local_evaluate(agent, num_episodes) diff --git a/soundspaces/challenge.py b/soundspaces/challenge.py new file mode 100644 index 0000000..c1a4981 --- /dev/null +++ b/soundspaces/challenge.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
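+
+# Usage sketch (illustrative, not part of the original file); MyAgent is a
+# hypothetical habitat.Agent subclass, and CHALLENGE_CONFIG_FILE must point
+# at a task config before construction:
+#
+#     agent = MyAgent()
+#     challenge = Challenge(eval_remote=False)
+#     challenge.submit(agent)  # logs metrics averaged over episodes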
+ +import os + +from habitat.core.logging import logger +from soundspaces.benchmark import Benchmark + + +class Challenge(Benchmark): + def __init__(self, eval_remote=False): + config_paths = os.environ["CHALLENGE_CONFIG_FILE"] + super().__init__(config_paths, eval_remote=eval_remote) + + def submit(self, agent): + metrics = super().evaluate(agent) + for k, v in metrics.items(): + logger.info("{}: {}".format(k, v)) diff --git a/soundspaces/datasets/__init__.py b/soundspaces/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/soundspaces/datasets/audionav_dataset.py b/soundspaces/datasets/audionav_dataset.py new file mode 100644 index 0000000..4441645 --- /dev/null +++ b/soundspaces/datasets/audionav_dataset.py @@ -0,0 +1,160 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import gzip +import json +import os +import logging +from typing import List, Optional + +from habitat.config import Config +from habitat.core.dataset import Dataset +from habitat.core.registry import registry +from habitat.tasks.nav.nav import ( + NavigationEpisode, + NavigationGoal, + ShortestPathPoint, +) + + +ALL_SCENES_MASK = "*" +CONTENT_SCENES_PATH_FIELD = "content_scenes_path" +DEFAULT_SCENE_PATH_PREFIX = "data/scene_dataset/" + + +@registry.register_dataset(name="AudioNav") +class AudioNavDataset(Dataset): + r"""Class inherited from Dataset that loads Audio Navigation dataset. + """ + + episodes: List[NavigationEpisode] + content_scenes_path: str = "{data_path}/content/{scene}.json.gz" + + @staticmethod + def check_config_paths_exist(config: Config) -> bool: + return os.path.exists( + config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT) + ) and os.path.exists(config.SCENES_DIR) + + @staticmethod + def get_scenes_to_load(config: Config) -> List[str]: + r"""Return list of scene ids for which dataset has separate files with + episodes. 
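+
+        For reference, a sketch of the expected on-disk layout, with
+        illustrative dataset/version/split names (see the README tree):
+
+            data/datasets/audionav/mp3d/v1/train/train.json.gz
+            data/datasets/audionav/mp3d/v1/train/content/<scene>.json.gz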
+ """ + assert AudioNavDataset.check_config_paths_exist(config), \ + (config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT), config.SCENES_DIR) + dataset_dir = os.path.dirname( + config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT) + ) + + cfg = config.clone() + cfg.defrost() + cfg.CONTENT_SCENES = [] + dataset = AudioNavDataset(cfg) + return AudioNavDataset._get_scenes_from_folder( + content_scenes_path=dataset.content_scenes_path, + dataset_dir=dataset_dir, + ) + + @staticmethod + def _get_scenes_from_folder(content_scenes_path, dataset_dir): + scenes = [] + content_dir = content_scenes_path.split("{scene}")[0] + scene_dataset_ext = content_scenes_path.split("{scene}")[1] + content_dir = content_dir.format(data_path=dataset_dir) + if not os.path.exists(content_dir): + return scenes + + for filename in os.listdir(content_dir): + if filename.endswith(scene_dataset_ext): + scene = filename[: -len(scene_dataset_ext)] + scenes.append(scene) + scenes.sort() + return scenes + + def __init__(self, config: Optional[Config] = None) -> None: + self.episodes = [] + self._config = config + + if config is None: + return + + datasetfile_path = config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT) + with gzip.open(datasetfile_path, "rt") as f: + self.from_json(f.read(), scenes_dir=config.SCENES_DIR, scene_filename=datasetfile_path) + + # Read separate file for each scene + dataset_dir = os.path.dirname(datasetfile_path) + scenes = config.CONTENT_SCENES + if ALL_SCENES_MASK in scenes: + scenes = AudioNavDataset._get_scenes_from_folder( + content_scenes_path=self.content_scenes_path, + dataset_dir=dataset_dir, + ) + + last_episode_cnt = 0 + for scene in scenes: + scene_filename = self.content_scenes_path.format( + data_path=dataset_dir, scene=scene + ) + with gzip.open(scene_filename, "rt") as f: + self.from_json(f.read(), scenes_dir=config.SCENES_DIR, scene_filename=scene_filename) + + num_episode = len(self.episodes) - last_episode_cnt + last_episode_cnt = len(self.episodes) + logging.info('Sampled {} from {}'.format(num_episode, scene)) + + def filter_by_ids(self, scene_ids): + episodes_to_keep = list() + + for episode in self.episodes: + for scene_id in scene_ids: + scene, ep_id = scene_id.split(',') + if scene in episode.scene_id and ep_id == episode.episode_id: + episodes_to_keep.append(episode) + + self.episodes = episodes_to_keep + + # filter by scenes for data collection + def filter_by_scenes(self, scene): + episodes_to_keep = list() + + for episode in self.episodes: + episode_scene = episode.scene_id.split("/")[3] + if scene == episode_scene: + episodes_to_keep.append(episode) + + self.episodes = episodes_to_keep + + def from_json( + self, json_str: str, scenes_dir: Optional[str] = None, scene_filename: Optional[str] = None + ) -> None: + deserialized = json.loads(json_str) + if CONTENT_SCENES_PATH_FIELD in deserialized: + self.content_scenes_path = deserialized[CONTENT_SCENES_PATH_FIELD] + + episode_cnt = 0 + for episode in deserialized["episodes"]: + episode = NavigationEpisode(**episode) + + if scenes_dir is not None: + if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX): + episode.scene_id = episode.scene_id[ + len(DEFAULT_SCENE_PATH_PREFIX): + ] + + episode.scene_id = os.path.join(scenes_dir, episode.scene_id) + + for g_index, goal in enumerate(episode.goals): + episode.goals[g_index] = NavigationGoal(**goal) + if episode.shortest_paths is not None: + for path in episode.shortest_paths: + for p_index, point in enumerate(path): + 
path[p_index] = ShortestPathPoint(**point) + self.episodes.append(episode) + episode_cnt += 1 diff --git a/soundspaces/datasets/semantic_audionav_dataset.py b/soundspaces/datasets/semantic_audionav_dataset.py new file mode 100644 index 0000000..17aea44 --- /dev/null +++ b/soundspaces/datasets/semantic_audionav_dataset.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# The MIT license below is in the original source at https://github.com/facebookresearch/sound-spaces/blob/main/soundspaces/datasets/semantic_audionav_dataset.py +# although the sound-spaces package is licensed as CC-BY-4.0 + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os +from typing import Any, Dict, List, Optional +import logging +import gzip +import sys + +from habitat.config import Config +from habitat.core.registry import registry +from habitat.core.simulator import AgentState, ShortestPathPoint +from habitat.core.dataset import Dataset +from habitat.core.utils import DatasetFloatJSONEncoder +from soundspaces.tasks.semantic_audionav_task import SemanticAudioGoalNavEpisode, SemanticAudioGoal, ObjectViewLocation + +ALL_SCENES_MASK = "*" +CONTENT_SCENES_PATH_FIELD = "content_scenes_path" +DEFAULT_SCENE_PATH_PREFIX = "data/scene_dataset/" + + +@registry.register_dataset(name="SemanticAudioNav") +class SemanticAudioNavDataset(Dataset): + category_to_task_category_id: Dict[str, int] + category_to_scene_annotation_category_id: Dict[str, int] + episodes: List[SemanticAudioGoalNavEpisode] + content_scenes_path: str = "{data_path}/content/{scene}.json.gz" + goals_by_category: Dict[str, List[SemanticAudioGoal]] + + @staticmethod + def check_config_paths_exist(config: Config) -> bool: + return os.path.exists( + config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT) + ) and os.path.exists(config.SCENES_DIR) + + @staticmethod + def get_scenes_to_load(config: Config, **kwargs) -> List[str]: + r"""Return list of scene ids for which dataset has separate files with + episodes. 
+ """ + assert SemanticAudioNavDataset.check_config_paths_exist(config), \ + (config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT), config.SCENES_DIR) + dataset_dir = os.path.dirname( + config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT) + ) + + cfg = config.clone() + cfg.defrost() + cfg.CONTENT_SCENES = [] + dataset = SemanticAudioNavDataset(cfg) + return SemanticAudioNavDataset._get_scenes_from_folder( + content_scenes_path=dataset.content_scenes_path, + dataset_dir=dataset_dir, + ) + + @staticmethod + def _get_scenes_from_folder(content_scenes_path, dataset_dir): + scenes = [] + content_dir = content_scenes_path.split("{scene}")[0] + scene_dataset_ext = content_scenes_path.split("{scene}")[1] + content_dir = content_dir.format(data_path=dataset_dir) + if not os.path.exists(content_dir): + return scenes + + for filename in os.listdir(content_dir): + if filename.endswith(scene_dataset_ext): + scene = filename[: -len(scene_dataset_ext)] + scenes.append(scene) + scenes.sort() + return scenes + + @staticmethod + def dedup_goals(dataset: Dict[str, Any]) -> Dict[str, Any]: + if len(dataset["episodes"]) == 0: + return dataset + + goals_by_category = dict() + for i, ep in enumerate(dataset["episodes"]): + dataset["episodes"][i]["object_category"] = ep["goals"][0][ + "object_category" + ] + ep = SemanticAudioGoalNavEpisode(**ep) + + goals_key = ep.goals_key + if goals_key not in goals_by_category: + goals_by_category[goals_key] = ep.goals + + dataset["episodes"][i]["goals"] = [] + + dataset["goals_by_category"] = goals_by_category + + return dataset + + def to_json(self) -> str: + for i in range(len(self.episodes)): + self.episodes[i].goals = [] + + result = DatasetFloatJSONEncoder().encode(self) + + for i in range(len(self.episodes)): + self.episodes[i].goals = self.goals_by_category[ + self.episodes[i].goals_key + ] + + return result + + def __init__(self, config: Optional[Config] = None) -> None: + self.episodes = [] + self._config = config + + if config is None: + return + + datasetfile_path = config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT) + with gzip.open(datasetfile_path, "rt") as f: + self.from_json(f.read(), scenes_dir=config.SCENES_DIR, scene_filename=datasetfile_path) + + # Read separate file for each scene + dataset_dir = os.path.dirname(datasetfile_path) + scenes = config.CONTENT_SCENES + if ALL_SCENES_MASK in scenes: + scenes = SemanticAudioNavDataset._get_scenes_from_folder( + content_scenes_path=self.content_scenes_path, + dataset_dir=dataset_dir, + ) + + last_episode_cnt = 0 + ''' + # for debug + episode_ids = {} + ''' + for scene in scenes: + scene_filename = self.content_scenes_path.format( + data_path=dataset_dir, scene=scene + ) + with gzip.open(scene_filename, "rt") as f: + self.from_json(f.read(), scenes_dir=config.SCENES_DIR, scene_filename=scene_filename) + ''' + # for debug + episode_ids[self.episodes[-1].scene_id] = [] + ''' + num_episode = len(self.episodes) - last_episode_cnt + last_episode_cnt = len(self.episodes) + logging.debug('Sampled {} from {}'.format(num_episode, scene)) + logging.info(f"Sampled {len(self.episodes)} episodes from {len(scenes)} scenes.") + ''' + # for debug + t_cnt = 0 + if len( self.episodes)>0: + # print(self.episodes[0]) + # sys.exit() + for elem in self.episodes: + if elem.episode_id not in episode_ids: + episode_ids[elem.scene_id].append(elem.episode_id) + + for scene, epi in episode_ids.items(): + t_cnt += len(epi) + print('total episode ids', t_cnt) + ''' + + @staticmethod + def 
__deserialize_goal(serialized_goal: Dict[str, Any]) -> SemanticAudioGoal: + g = SemanticAudioGoal(**serialized_goal) + + for vidx, view in enumerate(g.view_points): + view_location = ObjectViewLocation(view, iou=0) + view_location.agent_state = AgentState(view_location.agent_state) + g.view_points[vidx] = view_location + + return g + + def from_json( + self, json_str: str, scenes_dir: Optional[str] = None, scene_filename: Optional[str] = None + ) -> None: + deserialized = json.loads(json_str) + if CONTENT_SCENES_PATH_FIELD in deserialized: + self.content_scenes_path = deserialized[CONTENT_SCENES_PATH_FIELD] + + # if "category_to_task_category_id" in deserialized: + # self.category_to_task_category_id = deserialized[ + # "category_to_task_category_id" + # ] + # + # if "category_to_scene_annotation_category_id" in deserialized: + # self.category_to_scene_annotation_category_id = deserialized[ + # "category_to_scene_annotation_category_id" + # ] + # + # if "category_to_mp3d_category_id" in deserialized: + # self.category_to_scene_annotation_category_id = deserialized[ + # "category_to_mp3d_category_id" + # ] + # + # assert len(self.category_to_task_category_id) == len( + # self.category_to_scene_annotation_category_id + # ) + + # assert set(self.category_to_task_category_id.keys()) == set( + # self.category_to_scene_annotation_category_id.keys() + # ), "category_to_task and category_to_mp3d must have the same keys" + + if len(deserialized["episodes"]) == 0: + return + + # if "goals_by_category" not in deserialized: + # deserialized = self.dedup_goals(deserialized) + # + # for k, v in deserialized["goals_by_category"].items(): + # self.goals_by_category[k] = [self.__deserialize_goal(g) for g in v] + + for i, episode in enumerate(deserialized["episodes"]): + episode = SemanticAudioGoalNavEpisode(**episode) + # episode.episode_id = str(i) + + if scenes_dir is not None: + if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX): + episode.scene_id = episode.scene_id[ + len(DEFAULT_SCENE_PATH_PREFIX): + ] + + episode.scene_id = os.path.join(scenes_dir, episode.scene_id) + + for g_index, goal in enumerate(episode.goals): + episode.goals[g_index] = self.__deserialize_goal(goal) + if episode.shortest_paths is not None: + for path in episode.shortest_paths: + for p_index, point in enumerate(path): + path[p_index] = ShortestPathPoint(**point) + self.episodes.append(episode) + + # the agent can navigate to any instance of the target object category + # episode.goals = self.goals_by_category[episode.goals_key] + + # if episode.shortest_paths is not None: + # for path in episode.shortest_paths: + # for p_index, point in enumerate(path): + # if point is None or isinstance(point, (int, str)): + # point = { + # "action": point, + # "rotation": None, + # "position": None, + # } + # + # path[p_index] = ShortestPathPoint(**point) + + # self.episodes.append(episode) diff --git a/soundspaces/datasets/semantic_audionav_vln_dataset.py b/soundspaces/datasets/semantic_audionav_vln_dataset.py new file mode 100644 index 0000000..1ea44e9 --- /dev/null +++ b/soundspaces/datasets/semantic_audionav_vln_dataset.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. 
+# +# SPDX-License-Identifier: AGPL-3.0-or-later +# SPDX-License-Identifier: CC-BY-4.0 + +# Adapted from semantic_audionav_dataset.py + +# The MIT license below is in the original source at https://github.com/facebookresearch/sound-spaces/blob/main/soundspaces/datasets/semantic_audionav_dataset.py +# although the sound-spaces package is licensed as CC-BY-4.0 + +# Copyright (c) Facebook, Inc. and its affiliates. +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import json +import os +from typing import Any, Dict, List, Optional +import logging +import gzip + +from habitat.config import Config +from habitat.core.registry import registry +from habitat.core.simulator import AgentState, ShortestPathPoint +from habitat.core.dataset import Dataset +from habitat.core.utils import DatasetFloatJSONEncoder +from soundspaces.tasks.semantic_audiodialognav_task import SemanticAudioGoalDialogNavEpisode, SemanticAudioGoal, \ + ObjectViewLocation + +ALL_SCENES_MASK = "*" +CONTENT_SCENES_PATH_FIELD = "content_scenes_path" +DEFAULT_SCENE_PATH_PREFIX = "data/scene_dataset/" + + +@registry.register_dataset(name="SemanticAudioDialogNav") +class SemanticAudioDialogNavDataset(Dataset): + category_to_task_category_id: Dict[str, int] + category_to_scene_annotation_category_id: Dict[str, int] + episodes: List[SemanticAudioGoalDialogNavEpisode] + content_scenes_path: str = "{data_path}/content/{scene}.json.gz" + goals_by_category: Dict[str, List[SemanticAudioGoal]] + + @staticmethod + def check_config_paths_exist(config: Config) -> bool: + return os.path.exists( + config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT) + ) and os.path.exists(config.SCENES_DIR) + + @staticmethod + def get_scenes_to_load(config: Config, **kwargs) -> List[str]: + r"""Return list of scene ids for which dataset has separate files with + episodes. + """ + assert SemanticAudioDialogNavDataset.check_config_paths_exist(config), \ + (config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT), config.SCENES_DIR) + dataset_dir = os.path.dirname( + config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT) + ) + + cfg = config.clone() + cfg.defrost() + cfg.CONTENT_SCENES = [] + dataset = SemanticAudioDialogNavDataset(cfg) + return SemanticAudioDialogNavDataset._get_scenes_from_folder( + content_scenes_path=dataset.content_scenes_path, + dataset_dir=dataset_dir, + ) + + @staticmethod + def _get_scenes_from_folder(content_scenes_path, dataset_dir): + scenes = [] + content_dir = content_scenes_path.split("{scene}")[0] + scene_dataset_ext = content_scenes_path.split("{scene}")[1] + content_dir = content_dir.format(data_path=dataset_dir) + if not os.path.exists(content_dir): + return scenes + + for filename in os.listdir(content_dir): + if filename.endswith(scene_dataset_ext): + scene = filename[: -len(scene_dataset_ext)] + scenes.append(scene) + scenes.sort() + return scenes + + @staticmethod + def dedup_goals(dataset: Dict[str, Any]) -> Dict[str, Any]: + # ??? 
+ if len(dataset["episodes"]) == 0: + return dataset + + goals_by_category = dict() + for i, ep in enumerate(dataset["episodes"]): + dataset["episodes"][i]["object_category"] = ep["goals"][0][ + "object_category" + ] + ep = SemanticAudioGoalDialogNavEpisode(**ep) + + goals_key = ep.goals_key + if goals_key not in goals_by_category: + goals_by_category[goals_key] = ep.goals + + dataset["episodes"][i]["goals"] = [] + + dataset["goals_by_category"] = goals_by_category + + return dataset + + def to_json(self) -> str: + for i in range(len(self.episodes)): + self.episodes[i].goals = [] + + result = DatasetFloatJSONEncoder().encode(self) + + for i in range(len(self.episodes)): + self.episodes[i].goals = self.goals_by_category[ + self.episodes[i].goals_key + ] + + return result + + def __init__(self, config: Optional[Config] = None) -> None: + self.episodes = [] + self._config = config + + if config is None: + return + + datasetfile_path = config.DATA_PATH.format(version=config.VERSION, split=config.SPLIT) + with gzip.open(datasetfile_path, "rt") as f: + self.from_json(f.read(), scenes_dir=config.SCENES_DIR, scene_filename=datasetfile_path) + + # Read separate file for each scene + dataset_dir = os.path.dirname(datasetfile_path) + scenes = config.CONTENT_SCENES + if ALL_SCENES_MASK in scenes: + scenes = SemanticAudioDialogNavDataset._get_scenes_from_folder( + content_scenes_path=self.content_scenes_path, + dataset_dir=dataset_dir, + ) + + last_episode_cnt = 0 + for scene in scenes: + scene_filename = self.content_scenes_path.format( + data_path=dataset_dir, scene=scene + ) + with gzip.open(scene_filename, "rt") as f: + self.from_json(f.read(), scenes_dir=config.SCENES_DIR, scene_filename=scene_filename) + + num_episode = len(self.episodes) - last_episode_cnt + last_episode_cnt = len(self.episodes) + logging.debug('Sampled {} from {}'.format(num_episode, scene)) + logging.info(f"Sampled {len(self.episodes)} episodes from {len(scenes)} scenes.") + + @staticmethod + def __deserialize_goal(serialized_goal: Dict[str, Any]) -> SemanticAudioGoal: + g = SemanticAudioGoal(**serialized_goal) + + for vidx, view in enumerate(g.view_points): + view_location = ObjectViewLocation(view, iou=0) + view_location.agent_state = AgentState(view_location.agent_state) + g.view_points[vidx] = view_location + + return g + + def from_json( + self, json_str: str, scenes_dir: Optional[str] = None, scene_filename: Optional[str] = None + ) -> None: + deserialized = json.loads(json_str) + if CONTENT_SCENES_PATH_FIELD in deserialized: + self.content_scenes_path = deserialized[CONTENT_SCENES_PATH_FIELD] + + # if "category_to_task_category_id" in deserialized: + # self.category_to_task_category_id = deserialized[ + # "category_to_task_category_id" + # ] + # + # if "category_to_scene_annotation_category_id" in deserialized: + # self.category_to_scene_annotation_category_id = deserialized[ + # "category_to_scene_annotation_category_id" + # ] + # + # if "category_to_mp3d_category_id" in deserialized: + # self.category_to_scene_annotation_category_id = deserialized[ + # "category_to_mp3d_category_id" + # ] + # + # assert len(self.category_to_task_category_id) == len( + # self.category_to_scene_annotation_category_id + # ) + + # assert set(self.category_to_task_category_id.keys()) == set( + # self.category_to_scene_annotation_category_id.keys() + # ), "category_to_task and category_to_mp3d must have the same keys" + + if len(deserialized["episodes"]) == 0: + return + + # if "goals_by_category" not in deserialized: + # deserialized = 
self.dedup_goals(deserialized) + # + # for k, v in deserialized["goals_by_category"].items(): + # self.goals_by_category[k] = [self.__deserialize_goal(g) for g in v] + + for i, episode in enumerate(deserialized["episodes"]): + episode = SemanticAudioGoalDialogNavEpisode(**episode) + # episode.episode_id = str(i) + + if scenes_dir is not None: + if episode.scene_id.startswith(DEFAULT_SCENE_PATH_PREFIX): + episode.scene_id = episode.scene_id[ + len(DEFAULT_SCENE_PATH_PREFIX): + ] + + episode.scene_id = os.path.join(scenes_dir, episode.scene_id) + + for g_index, goal in enumerate(episode.goals): + episode.goals[g_index] = self.__deserialize_goal(goal) + if episode.shortest_paths is not None: + for path in episode.shortest_paths: + for p_index, point in enumerate(path): + path[p_index] = ShortestPathPoint(**point) + self.episodes.append(episode) + + # the agent can navigate to any instance of the target object category + # episode.goals = self.goals_by_category[episode.goals_key] + + # if episode.shortest_paths is not None: + # for path in episode.shortest_paths: + # for p_index, point in enumerate(path): + # if point is None or isinstance(point, (int, str)): + # point = { + # "action": point, + # "rotation": None, + # "position": None, + # } + # + # path[p_index] = ShortestPathPoint(**point) + + # self.episodes.append(episode) diff --git a/soundspaces/mp3d_utils.py b/soundspaces/mp3d_utils.py new file mode 100644 index 0000000..2503945 --- /dev/null +++ b/soundspaces/mp3d_utils.py @@ -0,0 +1,197 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +from collections import defaultdict +import attr + +from scipy.spatial import cKDTree +import numpy as np +from numpy.linalg import norm + + +SCENE_SPLITS = { + 'train': ['sT4fr6TAbpF', 'E9uDoFAP3SH', 'VzqfbhrpDEA', 'kEZ7cmS4wCh', '29hnd4uzFmX', 'ac26ZMwG7aT', + 'i5noydFURQK', 's8pcmisQ38h', 'rPc6DW4iMge', 'EDJbREhghzL', 'mJXqzFtmKg4', 'B6ByNegPMKs', + 'JeFG25nYj2p', '82sE5b5pLXE', 'D7N2EKCX4Sj', '7y3sRwLe3Va', 'HxpKQynjfin', '5LpN3gDmAk7', + 'gTV8FGcVJC9', 'ur6pFq6Qu1A', 'qoiz87JEwZ2', 'PuKPg4mmafe', 'VLzqgDo317F', 'aayBHfsNo7d', + 'JmbYfDe2QKZ', 'XcA2TqTSSAj', '8WUmhLawc2A', 'sKLMLpTHeUy', 'r47D5H71a5s', 'Uxmj2M2itWa', + 'Pm6F8kyY3z2', 'p5wJjkQkbXX', '759xd9YjKW5', 'JF19kD82Mey', 'V2XKFyX4ASd', '1LXtFkjw3qL', + '17DRP5sb8fy', '5q7pvUzZiYa', 'VVfe2KiqLaN', 'Vvot9Ly1tCj', 'ULsKaCPVFJR', 'D7G3Y4RVNrH', + 'uNb9QFRL6hY', 'ZMojNkEp431', '2n8kARJN3HM', 'vyrNrziPKCB', 'e9zR4mvMWw7', 'r1Q1Z4BcV1o', + 'PX4nDJXEHrG', 'YmJkqBEsHnH', 'b8cTxDM8gDG', 'GdvgFV5R1Z5', 'pRbA3pwrgk9', 'jh4fc5c5qoQ', + '1pXnuDYAj8r', 'S9hNv5qa7GM', 'VFuaQ6m2Qom', 'cV4RVeZvu5T', 'SN83YJsR3w2'], + 'val': ['x8F5xyUWy9e', 'QUCTc6BB5sX', 'EU6Fwq7SyZv', '2azQ1b91cZZ', 'Z6MFQCViBuw', 'pLe4wQe7qrG', + 'oLBMNvg9in8', 'X7HyMhZNoso', 'zsNo4HB9uLZ', 'TbHJrupSAjP', '8194nk5LbLH'], + 'test': ['pa4otMbVnkk', 'yqstnuAEVhm', '5ZKStnWn8Zo', 'Vt2qJdWjCF2', 'wc2JMjhGNzB', 'WYY7iVyf5p8', + 'fzynW3qQPVF', 'UwV83HsGsw3', 'q9vSo1VnCiC', 'ARNzJeq3xxb', 'rqfALeAoiTq', 'gYvKGZ5eRqb', + 'YFuZgdQ5vWj', 'jtcxE69GiFV', 'gxdoqLR6rwA'], +} +SCENE_SPLITS['train_distractor'] = SCENE_SPLITS['train'] +SCENE_SPLITS['val_distractor'] = SCENE_SPLITS['val'] +SCENE_SPLITS['test_distractor'] = SCENE_SPLITS['test'] + +MPCAT40_CATEGORY_INDICES = [3, 5, 6, 7, 8, 10, 11, 13, 14, 15, 18, 19, 20, 22, 23, 25, 26, 27, 33, 34, 38] + + +CATEGORY_INDEX_MAPPING = { + 'chair': 0, + 'table': 1, + 'picture': 2, + 'cabinet': 3, + 'cushion': 4, + 'sofa': 5, + 'bed': 6, + 
+    'chest_of_drawers': 7,
+    'plant': 8,
+    'sink': 9,
+    'toilet': 10,
+    'stool': 11,
+    'towel': 12,
+    'tv_monitor': 13,
+    'shower': 14,
+    'bathtub': 15,
+    'counter': 16,
+    'fireplace': 17,
+    'gym_equipment': 18,
+    'seating': 19,
+    'clothes': 20
+    }
+
+
+@attr.s
+class Object:
+    object_index = attr.ib(converter=int)
+    region_index = attr.ib(converter=int)
+    category_index = attr.ib(converter=int)
+    px = attr.ib(converter=float)
+    py = attr.ib(converter=float)
+    pz = attr.ib(converter=float)
+    a0x = attr.ib(converter=float)
+    a0y = attr.ib(converter=float)
+    a0z = attr.ib(converter=float)
+    a1x = attr.ib(converter=float)
+    a1y = attr.ib(converter=float)
+    a1z = attr.ib(converter=float)
+    r0 = attr.ib(converter=float)
+    r1 = attr.ib(converter=float)
+    r2 = attr.ib(converter=float)
+
+
+class HouseReader:
+    """
+    The .house file has a sequence of ascii lines with fields separated by spaces in the following format:
+
+    H name label #images #panoramas #vertices #surfaces #segments #objects #categories #regions #portals #levels 0 0 0 0 0 xlo ylo zlo xhi yhi zhi 0 0 0 0 0
+    L level_index #regions label px py pz xlo ylo zlo xhi yhi zhi 0 0 0 0 0
+    R region_index level_index 0 0 label px py pz xlo ylo zlo xhi yhi zhi height 0 0 0 0
+    P portal_index region0_index region1_index label xlo ylo zlo xhi yhi zhi 0 0 0 0
+    S surface_index region_index 0 label px py pz nx ny nz xlo ylo zlo xhi yhi zhi 0 0 0 0 0
+    V vertex_index surface_index label px py pz nx ny nz 0 0 0
+    P name panorama_index region_index 0 px py pz 0 0 0 0 0
+    I image_index panorama_index name camera_index yaw_index e00 e01 e02 e03 e10 e11 e12 e13 e20 e21 e22 e23 e30 e31 e32 e33 i00 i01 i02 i10 i11 i12 i20 i21 i22 width height px py pz 0 0 0 0 0
+    C category_index category_mapping_index category_mapping_name mpcat40_index mpcat40_name 0 0 0 0 0
+    O object_index region_index category_index px py pz a0x a0y a0z a1x a1y a1z r0 r1 r2 0 0 0 0 0 0 0 0
+    E segment_index object_index id area px py pz xlo ylo zlo xhi yhi zhi 0 0 0 0 0
+
+    where xxx_index indicates the index of the xxx in the house file (starting at 0),
+    #xxxs indicates how many xxxs will appear later in the file that back reference (associate) to this entry,
+    (px,py,pz) is a representative position, (nx,ny,nz) is a normal direction,
+    (xlo, ylo, zlo, xhi, yhi, zhi) is an axis-aligned bounding box,
+    camera_index is in [0-5], yaw_index is in [0-2],
+    (e00 e01 e02 e03 e10 e11 e12 e13 e20 e21 e22 e23 e30 e31 e32 e33) are the extrinsic matrix of a camera,
+    (i00 i01 i02 i10 i11 i12 i20 i21 i22) are the intrinsic matrix for a camera,
+    (px, py, pz, a0x, a0y, a0z, a1x, a1y, a1z, r0, r1, r2) define the center, axis directions, and radii of an oriented bounding box,
+    height is the distance from the floor, and
+    0 is a value that can be ignored.
+
+    The extent of each region is defined by a prism with its vertical extent dictated by its height and
+    its horizontal cross-section dictated by the counter-clockwise set of polygon vertices associated
+    with each surface associated with the region.
+
+    The extent of each object is defined by the oriented bounding box of the 'O' command.
+    The set of faces associated with each segment are ones whose 'face_material' field
+    in the xxx.ply file (described next) matches the segment 'id' in the 'S' command.
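+
+    For example (values illustrative), an object line such as
+        O 5 2 40 1.0 2.0 0.5 1 0 0 0 1 0 0.4 0.3 0.2 0 0 0 0 0 0 0 0
+    is parsed below via Object(*tokens[1:16]) into Object(object_index=5,
+    region_index=2, category_index=40, px=1.0, ..., r2=0.2).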
+ """ + def __init__(self, house_file): + self.data = defaultdict(list) + self.category_index2mpcat40_index = dict() + self.category_index2mpcat40_name = dict() + + with open(house_file, 'r') as fo: + annotations = fo.readlines() + for line in annotations[1:]: + tokens = line.split() + if tokens[0] == 'C': + category_index = int(tokens[1]) + mpcat40_index = int(tokens[4]) + mpcat40_name = tokens[5] + self.category_index2mpcat40_index[category_index] = mpcat40_index + self.category_index2mpcat40_name[category_index] = mpcat40_name + elif tokens[0] == 'O': + obj = Object(*tokens[1:16]) + self.data[tokens[0]].append(obj) + else: + self.data[tokens[0]].append(tokens[1:]) + + def find_objects_with_mpcat40_index(self, mpcat40_index): + found_objects = list() + for obj in self.data['O']: + if obj.category_index == -1: + # logging.warning('Category index: {}'.format(obj.category_index)) + continue + elif self.category_index2mpcat40_index[obj.category_index] == mpcat40_index: + found_objects.append(obj) + return found_objects + + def find_objects_with_mpcat40_indices(self): + objects = [] + for index in MPCAT40_CATEGORY_INDICES: + objects += self.find_objects_with_mpcat40_index(index) + return objects + + def find_objects_close_to(self, objects, points, threshold=1): + points = np.array(points) + kd_tree = cKDTree(points[:, [0, 2]]) + + num_object = 0 + if len(objects) > 0: + obj_pos = np.array([(obj.px, -obj.py) for obj in objects]) + d, _ = kd_tree.query(obj_pos) + num_object = sum(d < threshold) + + return num_object + + def find_points_in_bbx(self, points, objects, tol=1): + points = np.array(points) + points = np.stack([points[:, 0], -points[:, 2], points[:, 1] + 1.5], axis=-1) + + num_object = 0 + if len(objects) > 0: + for obj in objects: + v = points - np.array([obj.px, obj.py, obj.pz]) + a0 = np.array([obj.a0x, obj.a0y, obj.a0z]) + a1 = np.array([obj.a1x, obj.a1y, obj.a1z]) + a2 = np.cross(a0, a1) / np.linalg.norm(np.cross(a0, a1)) + d0 = np.inner(v, a0) + d1 = np.inner(v, a1) + d2 = np.inner(v, a2) + inside_bbx = (abs(d0) < obj.r0 + tol) & (abs(d1) < obj.r1 + tol) & (abs(d2) < obj.r2 + tol) + # print('{} points are near object, distances: {}/{}/{}'.format(sum(inside_bbx), d0, d1, d2)) + num_object += any(inside_bbx) + + return num_object + + def compute_object_to_category_index_mapping(self): + objects = self.data['O'] + mapping = dict() + for obj in objects: + if obj.category_index == -1: + mpcat40_index = -1 + else: + mpcat40_index = self.category_index2mpcat40_index[obj.category_index] + mapping[obj.object_index] = mpcat40_index + + return mapping + diff --git a/soundspaces/simulator.py b/soundspaces/simulator.py new file mode 100644 index 0000000..44a2d61 --- /dev/null +++ b/soundspaces/simulator.py @@ -0,0 +1,834 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. 
+# +# SPDX-License-Identifier: AGPL-3.0-or-later +# SPDX-License-Identifier: CC-BY-4.0 + +from typing import Any, List, Optional +from abc import ABC +from collections import defaultdict, namedtuple +import logging +import time +import pickle +import os, sys +import json + +import librosa +import scipy +from scipy.io import wavfile +from scipy.signal import fftconvolve +import numpy as np +import networkx as nx +from gym import spaces + +from habitat.core.registry import registry +import habitat_sim +from habitat_sim.utils.common import quat_from_angle_axis, quat_from_coeffs, quat_to_angle_axis +from habitat.sims.habitat_simulator.actions import HabitatSimActions +from habitat.core.simulator import ( + AgentState, + Config, + Observations, + SensorSuite, + ShortestPathPoint, + Simulator, +) +from soundspaces.utils import load_metadata +from soundspaces.mp3d_utils import HouseReader + + +def overwrite_config(config_from: Config, config_to: Any) -> None: + r"""Takes Habitat-API config and Habitat-Sim config structures. Overwrites + Habitat-Sim config with Habitat-API values, where a field name is present + in lowercase. Mostly used to avoid :ref:`sim_cfg.field = hapi_cfg.FIELD` + code. + Args: + config_from: Habitat-API config node. + config_to: Habitat-Sim config structure. + """ + + def if_config_to_lower(config): + if isinstance(config, Config): + return {key.lower(): val for key, val in config.items()} + else: + return config + + for attr, value in config_from.items(): + if hasattr(config_to, attr.lower()): + setattr(config_to, attr.lower(), if_config_to_lower(value)) + + +class DummySimulator: + """ + Dummy simulator for avoiding loading the scene meshes when using cached observations. + """ + def __init__(self): + self.position = None + self.rotation = None + self._sim_obs = None + + def seed(self, seed): + pass + + def set_agent_state(self, position, rotation): + self.position = np.array(position, dtype=np.float32) + self.rotation = rotation + + def get_agent_state(self): + class State: + def __init__(self, position, rotation): + self.position = position + self.rotation = rotation + + return State(self.position, self.rotation) + + def set_sensor_observations(self, sim_obs): + self._sim_obs = sim_obs + + def get_sensor_observations(self): + return self._sim_obs + + def close(self): + pass + + +@registry.register_simulator() +class SoundSpacesSim(Simulator, ABC): + r"""Changes made to simulator wrapper over habitat-sim + + This simulator first loads the graph of current environment and moves the agent among nodes. + Any sounds can be specified in the episode and loaded in this simulator. + Args: + config: configuration for initializing the simulator. 
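+
+        A minimal driving loop, as a sketch (reset/step are defined below;
+        the config object is illustrative):
+
+            sim = SoundSpacesSim(config)   # loads the scene graph + metadata
+            obs = sim.reset()
+            obs = sim.step(HabitatSimActions.MOVE_FORWARD)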
+ """ + + def action_space_shortest_path(self, source: AgentState, targets: List[AgentState], agent_id: int = 0) -> List[ + ShortestPathPoint]: + pass + + def __init__(self, config: Config) -> None: + self.config = config + agent_config = self._get_agent_config() + + + sim_sensors = [] + + # print(agent_config.SENSORS) + + for sensor_name in agent_config.SENSORS: + + sensor_cfg = getattr(self.config, sensor_name) + sensor_type = registry.get_sensor(sensor_cfg.TYPE) + + assert sensor_type is not None, "invalid sensor type {}".format( + sensor_cfg.TYPE + ) + sim_sensors.append(sensor_type(sensor_cfg)) + + self._sensor_suite = SensorSuite(sim_sensors) + + # ----------------------------------------------- + # print(self._sensor_suite.sensors.values()) + # ----------------------------------------------- + + self.sim_config = self.create_sim_config(self._sensor_suite) + self._current_scene = self.sim_config.sim_cfg.scene_id + self._action_space = spaces.Discrete( + len(self.sim_config.agents[0].action_space) + ) + + # ----------------------------------------------- + # print('self.sim_config.agents[0].action_space', self.sim_config.agents[0].action_space) + # ----------------------------------------------- + + self._prev_sim_obs = None + self._source_position_index = None + self._receiver_position_index = None + self._rotation_angle = None + self._current_sound = None + self._offset = None + self._duration = None + self._audio_index = None + self._audio_length = None + self._source_sound_dict = dict() + self._sampling_rate = None + self._node2index = None + self._frame_cache = dict() + self._audiogoal_cache = dict() + self._spectrogram_cache = dict() + self._egomap_cache = defaultdict(dict) + self._scene_observations = None + self._episode_step_count = None + self._is_episode_active = None + self._position_to_index_mapping = dict() + self._previous_step_collided = False + self._instance2label_mapping = None + self._house_readers = dict() + self._use_oracle_planner = True + self._oracle_actions = list() + + # dialog pretraining + self._sub_instr = None + + + self.points, self.graph = load_metadata(self.metadata_dir) + for node in self.graph.nodes(): + self._position_to_index_mapping[self.position_encoding(self.graph.nodes()[node]['point'])] = node + + # ------------------------------------- + # also need to compute shortest path for all the points + self.paths = dict(nx.all_pairs_dijkstra_path(self.graph)) + + if self.config.AUDIO.HAS_DISTRACTOR_SOUND: + self._distractor_position_index = None + self._current_distractor_sound = None + + if self.config.USE_RENDERED_OBSERVATIONS: + self._sim = DummySimulator() + with open(self.current_scene_observation_file, 'rb') as fo: + self._frame_cache = pickle.load(fo) + else: + self._sim = habitat_sim.Simulator(config=self.sim_config) + + + with open('./data/node2view.json', 'r') as f: + self._node2view = json.load(f) + + + # ---------------------------------------------------------------------- + # Should initialize the dialog models here + # --------- + # ques gen + # self.ques_module = Ques_Gen() + + + + def create_sim_config( + self, _sensor_suite: SensorSuite + ) -> habitat_sim.Configuration: + sim_config = habitat_sim.SimulatorConfiguration() + overwrite_config( + config_from=self.config.HABITAT_SIM_V0, config_to=sim_config + ) + sim_config.scene_id = self.config.SCENE + agent_config = habitat_sim.AgentConfiguration() + overwrite_config( + config_from=self.get_agent_config(), config_to=agent_config + ) + + sensor_specifications = [] + for sensor in 
_sensor_suite.sensors.values(): + sim_sensor_cfg = habitat_sim.SensorSpec() + overwrite_config( + config_from=sensor.config, config_to=sim_sensor_cfg + ) + sim_sensor_cfg.uuid = sensor.uuid + sim_sensor_cfg.resolution = list( + sensor.observation_space.shape[:2] + ) + sim_sensor_cfg.parameters["hfov"] = str(sensor.config.HFOV) + + # accessing child attributes through parent interface + sim_sensor_cfg.sensor_type = sensor.sim_sensor_type # type: ignore + sim_sensor_cfg.gpu2gpu_transfer = ( + self.config.HABITAT_SIM_V0.GPU_GPU + ) + sensor_specifications.append(sim_sensor_cfg) + + agent_config.sensor_specifications = sensor_specifications + agent_config.action_space = registry.get_action_space_configuration( + self.config.ACTION_SPACE_CONFIG + )(self.config).get() + + return habitat_sim.Configuration(sim_config, [agent_config]) + + @property + def sensor_suite(self) -> SensorSuite: + return self._sensor_suite + + def get_agent_config(self, agent_id: Optional[int] = None) -> Any: + if agent_id is None: + agent_id = self.config.DEFAULT_AGENT_ID + agent_name = self.config.AGENTS[agent_id] + agent_config = getattr(self.config, agent_name) + return agent_config + + def _update_agents_state(self) -> bool: + is_updated = False + for agent_id, _ in enumerate(self.config.AGENTS): + agent_cfg = self._get_agent_config(agent_id) + if agent_cfg.IS_SET_START_STATE: + self.set_agent_state( + agent_cfg.START_POSITION, + agent_cfg.START_ROTATION, + agent_id, + ) + is_updated = True + + return is_updated + + def _get_agent_config(self, agent_id: Optional[int] = None) -> Any: + if agent_id is None: + agent_id = self.config.DEFAULT_AGENT_ID + agent_name = self.config.AGENTS[agent_id] + agent_config = getattr(self.config, agent_name) + return agent_config + + def get_agent_state(self, agent_id: int = 0) -> habitat_sim.AgentState: + if self.config.USE_RENDERED_OBSERVATIONS: + return self._sim.get_agent_state() + else: + return self._sim.get_agent(agent_id).get_state() + + def set_agent_state( + self, + position: List[float], + rotation: List[float], + agent_id: int = 0, + reset_sensors: bool = True, + ) -> bool: + if self.config.USE_RENDERED_OBSERVATIONS: + self._sim.set_agent_state(position, rotation) + else: + agent = self._sim.get_agent(agent_id) + new_state = self.get_agent_state(agent_id) + new_state.position = position + new_state.rotation = rotation + + # NB: The agent state also contains the sensor states in _absolute_ + # coordinates. In order to set the agent's body to a specific + # location and have the sensors follow, we must not provide any + # state for the sensors. 
This will cause them to follow the agent's + # body + new_state.sensor_states = {} + agent.set_state(new_state, reset_sensors) + return True + + @property + def binaural_rir_dir(self): + return os.path.join(self.config.AUDIO.BINAURAL_RIR_DIR, self.config.SCENE_DATASET, self.current_scene_name) + + @property + def source_sound_dir(self): + return self.config.AUDIO.SOURCE_SOUND_DIR + + @property + def distractor_sound_dir(self): + return self.config.AUDIO.DISTRACTOR_SOUND_DIR + + @property + def metadata_dir(self): + return os.path.join(self.config.AUDIO.METADATA_DIR, self.config.SCENE_DATASET, self.current_scene_name) + + @property + def current_scene_name(self): + # config.SCENE (_current_scene) looks like 'data/scene_datasets/replica/office_1/habitat/mesh_semantic.ply' + return self._current_scene.split('/')[3] + + @property + def current_scene_observation_file(self): + return os.path.join(self.config.SCENE_OBSERVATION_DIR, self.config.SCENE_DATASET, + self.current_scene_name + '.pkl') + + @property + def current_source_sound(self): + return self._source_sound_dict[self._current_sound] + + @property + def is_silent(self): + return self._episode_step_count > self._duration + + @property + def pathfinder(self): + return self._sim.pathfinder + + def get_agent(self, agent_id): + return self._sim.get_agent(agent_id) + + def reconfigure(self, config: Config) -> None: + self.config = config + if hasattr(self.config.AGENT_0, 'OFFSET'): + self._offset = int(self.config.AGENT_0.OFFSET) + else: + self._offset = 0 + if self.config.AUDIO.EVERLASTING: + self._duration = 500 + else: + assert hasattr(self.config.AGENT_0, 'DURATION') + self._duration = int(self.config.AGENT_0.DURATION) + self._audio_index = 0 + is_same_sound = config.AGENT_0.SOUND_ID == self._current_sound + if not is_same_sound: + self._current_sound = self.config.AGENT_0.SOUND_ID + self._load_single_source_sound() + logging.debug("Switch to sound {} with duration {} seconds".format(self._current_sound, self._duration)) + + is_same_scene = config.SCENE == self._current_scene + if not is_same_scene: + self._current_scene = config.SCENE + logging.debug('Current scene: {} and sound: {}'.format(self.current_scene_name, self._current_sound)) + + if self.config.USE_RENDERED_OBSERVATIONS: + with open(self.current_scene_observation_file, 'rb') as fo: + self._frame_cache = pickle.load(fo) + else: + self._sim.close() + del self._sim + self.sim_config = self.create_sim_config(self._sensor_suite) + self._sim = habitat_sim.Simulator(self.sim_config) + self._update_agents_state() + self._frame_cache = dict() + logging.debug('Loaded scene {}'.format(self.current_scene_name)) + + self.points, self.graph = load_metadata(self.metadata_dir) + for node in self.graph.nodes(): + self._position_to_index_mapping[self.position_encoding(self.graph.nodes()[node]['point'])] = node + self._instance2label_mapping = None + self.paths = dict(nx.all_pairs_dijkstra_path(self.graph)) + + # dialog pretraining + self._sub_instr = self.config.AGENT_0.SUB_INSTR + + if not is_same_scene or not is_same_sound: + self._audiogoal_cache = dict() + self._spectrogram_cache = dict() + + self._episode_step_count = 0 + + # set agent positions + self._receiver_position_index = self._position_to_index(self.config.AGENT_0.START_POSITION) + self._source_position_index = self._position_to_index(self.config.AGENT_0.GOAL_POSITION) + # the agent rotates about +Y starting from -Z counterclockwise, + # so rotation angle 90 means the agent rotate about +Y 90 degrees + self._rotation_angle = 
int(np.around(np.rad2deg(quat_to_angle_axis(quat_from_coeffs(
+            self.config.AGENT_0.START_ROTATION))[0]))) % 360
+        if self.config.USE_RENDERED_OBSERVATIONS:
+            self._sim.set_agent_state(list(self.graph.nodes[self._receiver_position_index]['point']),
+                                      quat_from_coeffs(self.config.AGENT_0.START_ROTATION))
+        else:
+            self.set_agent_state(list(self.graph.nodes[self._receiver_position_index]['point']),
+                                 self.config.AGENT_0.START_ROTATION)
+
+        if self.config.AUDIO.HAS_DISTRACTOR_SOUND:
+            self._distractor_position_index = self.config.AGENT_0.DISTRACTOR_POSITION_INDEX
+            self._current_distractor_sound = self.config.AGENT_0.DISTRACTOR_SOUND_ID
+            self._load_single_distractor_sound()
+
+        if self._use_oracle_planner:
+            self._oracle_actions = self.compute_oracle_actions()
+
+        logging.debug("Initial source, agent at: {}, {}, orientation: {}".
+                      format(self._source_position_index, self._receiver_position_index, self.get_orientation()))
+
+    def compute_semantic_index_mapping(self):
+        # obtain mapping from instance id to semantic label id
+        if isinstance(self._sim, DummySimulator):
+            # NB: readers are keyed by scene; keying them by the current sound
+            # would cache the wrong reader once the scene changes
+            if self._current_scene not in self._house_readers:
+                self._house_readers[self._current_scene] = HouseReader(self._current_scene.replace('.glb', '.house'))
+            reader = self._house_readers[self._current_scene]
+            instance_id_to_label_id = reader.compute_object_to_category_index_mapping()
+        else:
+            scene = self._sim.semantic_scene
+            instance_id_to_label_id = {int(obj.id.split("_")[-1]): obj.category.index() for obj in scene.objects}
+        self._instance2label_mapping = np.array([instance_id_to_label_id[i] for i in range(len(instance_id_to_label_id))])
+
+    @staticmethod
+    def position_encoding(position):
+        return '{:.2f}_{:.2f}_{:.2f}'.format(*position)
+
+    def _position_to_index(self, position):
+        if self.position_encoding(position) in self._position_to_index_mapping:
+            return self._position_to_index_mapping[self.position_encoding(position)]
+        else:
+            raise ValueError("Position misalignment.")
+
+    def _get_sim_observation(self):
+        joint_index = (self._receiver_position_index, self._rotation_angle)
+        if joint_index in self._frame_cache:
+            return self._frame_cache[joint_index]
+        else:
+            assert not self.config.USE_RENDERED_OBSERVATIONS
+            sim_obs = self._sim.get_sensor_observations()
+            for sensor in sim_obs:
+                sim_obs[sensor] = sim_obs[sensor]
+            self._frame_cache[joint_index] = sim_obs
+            return sim_obs
+
+    def reset(self):
+        logging.debug('Reset simulation')
+        if self.config.USE_RENDERED_OBSERVATIONS:
+            sim_obs = self._get_sim_observation()
+            self._sim.set_sensor_observations(sim_obs)
+        else:
+            sim_obs = self._sim.reset()
+            if self._update_agents_state():
+                sim_obs = self._get_sim_observation()
+
+        self._is_episode_active = True
+        self._prev_sim_obs = sim_obs
+        self._previous_step_collided = False
+        # Encapsulate data under the Observations class
+        observations = self._sensor_suite.get_observations(sim_obs)
+
+        return observations
+
+    def step(self, action, only_allowed=True):
+        """
+        All angle calculations in this function are w.r.t. the habitat coordinate frame, on the X-Z plane,
+        where +Y is upward, -Z is forward and +X is rightward.
+        Angle 0 corresponds to +X and angle 90 corresponds to +Y; angles wrap modulo 360, so -90 corresponds to 270.
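+
+        Worked example of the convention (derived from get_orientation below):
+        the agent starts facing -Z, i.e. rotation angle 0 and mesh orientation
+        (270 - 0) % 360 = 270; one TURN_LEFT adds 90 to the rotation angle,
+        giving orientation (270 - 90) % 360 = 180.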
+ + :param action: action to be taken + :param only_allowed: if true, then can't step anywhere except allowed locations + :return: + Dict of observations + """ + + assert self._is_episode_active, ( + "episode is not active, environment not RESET or " + "STOP action called previously" + ) + + self._previous_step_collided = False + # STOP: 0, FORWARD: 1, LEFT: 2, RIGHT: 3, # QUERY: 9 (need to change it to 4) (or no need?) + # need to update the action space (specify distance and angle since we are using different action space) + if action == HabitatSimActions.STOP: + self._is_episode_active = False + + else: + prev_position_index = self._receiver_position_index + prev_rotation_angle = self._rotation_angle + if action == HabitatSimActions.MOVE_FORWARD: + # the agent initially faces -Z by default + self._previous_step_collided = True + for neighbor in self.graph[self._receiver_position_index]: + p1 = self.graph.nodes[self._receiver_position_index]['point'] + p2 = self.graph.nodes[neighbor]['point'] + direction = int(np.around(np.rad2deg(np.arctan2(p2[2] - p1[2], p2[0] - p1[0])))) % 360 + if direction == self.get_orientation(): + self._receiver_position_index = neighbor + self._previous_step_collided = False + break + elif action == HabitatSimActions.TURN_LEFT: + # agent rotates counterclockwise, so turning left means increasing rotation angle by 90 + self._rotation_angle = (self._rotation_angle + 90) % 360 + elif action == HabitatSimActions.TURN_RIGHT: + self._rotation_angle = (self._rotation_angle - 90) % 360 + ''' + # ----------------------------------------- + elif action == HabitatSimActions.QUERY: + # print('action for query', action) + # ------------------------------------- + # print('QUERY action') + # check to generate ques + # get observation + # sim_obs = self._get_sim_observation() + # print(sim_obs['rgb']) + # print(self.ques_module.ques_out(sim_obs['rgb'])) + # sys.exit() + + # for sanity check we are cosidering two forward steps + forward_cnt = 0 + + for i in range(2): + self._previous_step_collided = True + for neighbor in self.graph[self._receiver_position_index]: + p1 = self.graph.nodes[self._receiver_position_index]['point'] + p2 = self.graph.nodes[neighbor]['point'] + direction = int(np.around(np.rad2deg(np.arctan2(p2[2] - p1[2], p2[0] - p1[0])))) % 360 + if direction == self.get_orientation(): + self._receiver_position_index = neighbor + self._previous_step_collided = False + forward_cnt += 1 + break + + ''' + if self.config.CONTINUOUS_VIEW_CHANGE: + intermediate_observations = list() + fps = self.config.VIEW_CHANGE_FPS + if action == HabitatSimActions.MOVE_FORWARD: + prev_position = np.array(self.graph.nodes[prev_position_index]['point']) + current_position = np.array(self.graph.nodes[self._receiver_position_index]['point']) + for i in range(1, fps): + intermediate_position = prev_position + i / fps * (current_position - prev_position) + self.set_agent_state(intermediate_position.tolist(), quat_from_angle_axis(np.deg2rad( + self._rotation_angle), np.array([0, 1, 0]))) + sim_obs = self._sim.get_sensor_observations() + observations = self._sensor_suite.get_observations(sim_obs) + intermediate_observations.append(observations) + + + else: + for i in range(1, fps): + if action == HabitatSimActions.TURN_LEFT: + intermediate_rotation = prev_rotation_angle + i / fps * 90 + elif action == HabitatSimActions.TURN_RIGHT: + intermediate_rotation = prev_rotation_angle - i / fps * 90 + self.set_agent_state(list(self.graph.nodes[self._receiver_position_index]['point']), + 
quat_from_angle_axis(np.deg2rad(intermediate_rotation),
+                                                             np.array([0, 1, 0])))
+                        sim_obs = self._sim.get_sensor_observations()
+                        observations = self._sensor_suite.get_observations(sim_obs)
+                        intermediate_observations.append(observations)
+
+                self.set_agent_state(list(self.graph.nodes[self._receiver_position_index]['point']),
+                                     quat_from_angle_axis(np.deg2rad(self._rotation_angle), np.array([0, 1, 0])))
+            self._episode_step_count += 1
+
+            # log debugging info
+            logging.debug('After taking action {}, s,r: {}, {}, orientation: {}, location: {}'.format(
+                action, self._source_position_index, self._receiver_position_index,
+                self.get_orientation(), self.graph.nodes[self._receiver_position_index]['point']))
+
+        sim_obs = self._get_sim_observation()
+        if self.config.USE_RENDERED_OBSERVATIONS:
+            self._sim.set_sensor_observations(sim_obs)
+        self._prev_sim_obs = sim_obs
+        observations = self._sensor_suite.get_observations(sim_obs)
+        if self.config.CONTINUOUS_VIEW_CHANGE:
+            observations['intermediate'] = intermediate_observations
+
+        return observations
+
+    def get_orientation(self):
+        _base_orientation = 270
+        return (_base_orientation - self._rotation_angle) % 360
+
+    @property
+    def azimuth_angle(self):
+        # this is the angle used to index the binaural audio files
+        # in the mesh coordinate system, +Y is forward, +X rightward and +Z upward;
+        # azimuth is calculated clockwise, so +Y is 0 and +X is 90
+        return -(self._rotation_angle + 0) % 360
+
+    @property
+    def reaching_goal(self):
+        return self._source_position_index == self._receiver_position_index
+
+    def _load_source_sounds(self):
+        # load all mono files at once; key the cache by the full file name so the
+        # lookup in current_source_sound (which uses SOUND_ID, extension included) matches
+        sound_files = os.listdir(self.source_sound_dir)
+        for sound_file in sound_files:
+            audio_data, sr = librosa.load(os.path.join(self.source_sound_dir, sound_file),
+                                          sr=self.config.AUDIO.RIR_SAMPLING_RATE)
+            self._source_sound_dict[sound_file] = audio_data
+            self._audio_length = audio_data.shape[0] // self.config.AUDIO.RIR_SAMPLING_RATE
+
+    def _load_single_distractor_sound(self):
+        if self._current_distractor_sound not in self._source_sound_dict:
+            audio_data, sr = librosa.load(os.path.join(self.distractor_sound_dir, self._current_distractor_sound),
+                                          sr=self.config.AUDIO.RIR_SAMPLING_RATE)
+            self._source_sound_dict[self._current_distractor_sound] = audio_data
+
+    def _load_single_source_sound(self):
+        if self._current_sound not in self._source_sound_dict:
+            audio_data, sr = librosa.load(os.path.join(self.source_sound_dir, self._current_sound),
+                                          sr=self.config.AUDIO.RIR_SAMPLING_RATE)
+            self._source_sound_dict[self._current_sound] = audio_data
+            self._audio_length = self._source_sound_dict[self._current_sound].shape[0] // self.config.AUDIO.RIR_SAMPLING_RATE
+
+    def _compute_euclidean_distance_between_sr_locations(self):
+        p1 = self.graph.nodes[self._receiver_position_index]['point']
+        p2 = self.graph.nodes[self._source_position_index]['point']
+        d = np.sqrt((p1[0] - p2[0])**2 + (p1[2] - p2[2])**2)
+        return d
+
+    def _compute_audiogoal(self):
+        sampling_rate = self.config.AUDIO.RIR_SAMPLING_RATE
+        if self._episode_step_count > self._duration:
+            logging.debug('Step count is greater than duration. 
Empty spectrogram.') + audiogoal = np.zeros((2, sampling_rate)) + else: + binaural_rir_file = os.path.join(self.binaural_rir_dir, str(self.azimuth_angle), '{}_{}.wav'.format( + self._receiver_position_index, self._source_position_index)) + try: + sampling_freq, binaural_rir = wavfile.read(binaural_rir_file) # float32 + except ValueError: + logging.warning("{} file is not readable".format(binaural_rir_file)) + binaural_rir = np.zeros((sampling_rate, 2)).astype(np.float32) + if len(binaural_rir) == 0: + logging.debug("Empty RIR file at {}".format(binaural_rir_file)) + binaural_rir = np.zeros((sampling_rate, 2)).astype(np.float32) + + # by default, convolve in full mode, which preserves the direct sound + if self.current_source_sound.shape[0] == sampling_rate: + binaural_convolved = np.array([fftconvolve(self.current_source_sound, binaural_rir[:, channel] + ) for channel in range(binaural_rir.shape[-1])]) + audiogoal = binaural_convolved[:, :sampling_rate] + else: + index = self._audio_index + self._audio_index = (self._audio_index + 1) % self._audio_length + if index * sampling_rate - binaural_rir.shape[0] < 0: + source_sound = self.current_source_sound[: (index + 1) * sampling_rate] + binaural_convolved = np.array([fftconvolve(source_sound, binaural_rir[:, channel] + ) for channel in range(binaural_rir.shape[-1])]) + audiogoal = binaural_convolved[:, index * sampling_rate: (index + 1) * sampling_rate] + else: + # include reverb from previous time step + source_sound = self.current_source_sound[index * sampling_rate - binaural_rir.shape[0] + 1 + : (index + 1) * sampling_rate] + binaural_convolved = np.array([fftconvolve(source_sound, binaural_rir[:, channel], mode='valid', + ) for channel in range(binaural_rir.shape[-1])]) + audiogoal = binaural_convolved + + if self.config.AUDIO.HAS_DISTRACTOR_SOUND: + binaural_rir_file = os.path.join(self.binaural_rir_dir, str(self.azimuth_angle), '{}_{}.wav'.format( + self._receiver_position_index, self._distractor_position_index)) + try: + sampling_freq, distractor_rir = wavfile.read(binaural_rir_file) + except ValueError: + logging.warning("{} file is not readable".format(binaural_rir_file)) + distractor_rir = np.zeros((self.config.AUDIO.RIR_SAMPLING_RATE, 2)).astype(np.float32) + if len(distractor_rir) == 0: + logging.debug("Empty RIR file at {}".format(binaural_rir_file)) + distractor_rir = np.zeros((self.config.AUDIO.RIR_SAMPLING_RATE, 2)).astype(np.float32) + + distractor_convolved = np.array([fftconvolve(self._source_sound_dict[self._current_distractor_sound], + distractor_rir[:, channel] + ) for channel in range(distractor_rir.shape[-1])]) + audiogoal += distractor_convolved[:, :sampling_rate] + + return audiogoal + + def get_egomap_observation(self): + joint_index = (self._receiver_position_index, self._rotation_angle) + if joint_index in self._egomap_cache[self._current_scene]: + return self._egomap_cache[self._current_scene][joint_index] + else: + return None + + def cache_egomap_observation(self, egomap): + self._egomap_cache[self._current_scene][(self._receiver_position_index, self._rotation_angle)] = egomap + + def get_current_audiogoal_observation(self): + if self.config.AUDIO.HAS_DISTRACTOR_SOUND: + # by default, does not cache for distractor sound + audiogoal = self._compute_audiogoal() + else: + joint_index = (self._source_position_index, self._receiver_position_index, self.azimuth_angle) + if joint_index not in self._audiogoal_cache: + self._audiogoal_cache[joint_index] = self._compute_audiogoal() + audiogoal = 
self._audiogoal_cache[joint_index] + + return audiogoal + + def get_current_spectrogram_observation(self, audiogoal2spectrogram): + if self.config.AUDIO.HAS_DISTRACTOR_SOUND: + audiogoal = self.get_current_audiogoal_observation() + spectrogram = audiogoal2spectrogram(audiogoal) + else: + joint_index = (self._source_position_index, self._receiver_position_index, self.azimuth_angle) + if joint_index not in self._spectrogram_cache: + audiogoal = self.get_current_audiogoal_observation() + self._spectrogram_cache[joint_index] = audiogoal2spectrogram(audiogoal) + spectrogram = self._spectrogram_cache[joint_index] + + return spectrogram + + def geodesic_distance(self, position_a, position_bs, episode=None): + distances = [] + for position_b in position_bs: + index_a = self._position_to_index(position_a) + index_b = self._position_to_index(position_b) + assert index_a is not None and index_b is not None + path_length = nx.shortest_path_length(self.graph, index_a, index_b) * self.config.GRID_SIZE + distances.append(path_length) + + return min(distances) + + def get_straight_shortest_path_points(self, position_a, position_b): + index_a = self._position_to_index(position_a) + index_b = self._position_to_index(position_b) + assert index_a is not None and index_b is not None + + shortest_path = nx.shortest_path(self.graph, source=index_a, target=index_b) + points = list() + for node in shortest_path: + points.append(self.graph.nodes()[node]['point']) + return points + + def compute_oracle_actions(self): + start_node = self._receiver_position_index + end_node = self._source_position_index + shortest_path = nx.shortest_path(self.graph, source=start_node, target=end_node) + assert shortest_path[0] == start_node and shortest_path[-1] == end_node + logging.debug(shortest_path) + + oracle_actions = [] + orientation = self.get_orientation() + for i in range(len(shortest_path) - 1): + prev_node = shortest_path[i] + next_node = shortest_path[i+1] + p1 = self.graph.nodes[prev_node]['point'] + p2 = self.graph.nodes[next_node]['point'] + direction = int(np.around(np.rad2deg(np.arctan2(p2[2] - p1[2], p2[0] - p1[0])))) % 360 + if direction == orientation: + pass + elif (direction - orientation) % 360 == 270: + orientation = (orientation - 90) % 360 + oracle_actions.append(HabitatSimActions.TURN_LEFT) + elif (direction - orientation) % 360 == 90: + orientation = (orientation + 90) % 360 + oracle_actions.append(HabitatSimActions.TURN_RIGHT) + elif (direction - orientation) % 360 == 180: + orientation = (orientation - 180) % 360 + oracle_actions.append(HabitatSimActions.TURN_RIGHT) + oracle_actions.append(HabitatSimActions.TURN_RIGHT) + oracle_actions.append(HabitatSimActions.MOVE_FORWARD) + oracle_actions.append(HabitatSimActions.STOP) + return oracle_actions + + def get_oracle_action(self): + return self._oracle_actions[self._episode_step_count] + + @property + def previous_step_collided(self): + return self._previous_step_collided + + def find_nearest_graph_node(self, target_pos): + from scipy.spatial import cKDTree + all_points = np.array([self.graph.nodes()[node]['point'] for node in self.graph.nodes()]) + kd_tree = cKDTree(all_points[:, [0, 2]]) + d, ind = kd_tree.query(target_pos[[0, 2]]) + return all_points[ind] + + def seed(self, seed): + self._sim.seed(seed) + + def get_observations_at( + self, + position: Optional[List[float]] = None, + rotation: Optional[List[float]] = None, + keep_agent_at_new_pose: bool = False, + ) -> Optional[Observations]: + current_state = self.get_agent_state() + if position is None 
or rotation is None: + success = True + else: + success = self.set_agent_state( + position, rotation, reset_sensors=False + ) + + if success: + sim_obs = self._sim.get_sensor_observations() + + self._prev_sim_obs = sim_obs + + observations = self._sensor_suite.get_observations(sim_obs) + if not keep_agent_at_new_pose: + self.set_agent_state( + current_state.position, + current_state.rotation, + reset_sensors=False, + ) + return observations + else: + return None diff --git a/soundspaces/tasks/__init__.py b/soundspaces/tasks/__init__.py new file mode 100644 index 0000000..9d637f2 --- /dev/null +++ b/soundspaces/tasks/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +from soundspaces.tasks.action_space import SoundspacesDialogSimV0ActionSpaceConfiguration +from soundspaces.simulator import SoundSpacesSim +from soundspaces.datasets.audionav_dataset import AudioNavDataset +from soundspaces.datasets.semantic_audionav_dataset import SemanticAudioNavDataset +from soundspaces.tasks.audionav_task import * +from soundspaces.tasks.semantic_audionav_task import * +from soundspaces.tasks.nav import * diff --git a/soundspaces/tasks/action_space.py b/soundspaces/tasks/action_space.py new file mode 100644 index 0000000..6c3e478 --- /dev/null +++ b/soundspaces/tasks/action_space.py @@ -0,0 +1,87 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import attr +import habitat_sim +from habitat.core.registry import registry +from habitat.core.simulator import ActionSpaceConfiguration +from habitat.core.embodied_task import SimulatorTaskAction +from habitat.sims.habitat_simulator.actions import HabitatSimActions, HabitatSimV0ActionSpaceConfiguration +from habitat_sim.agent.controls.controls import ActuationSpec + +from typing import Any, Dict, List, Optional, Type + +HabitatSimActions.extend_action_space("MOVE_BACKWARD") +HabitatSimActions.extend_action_space("MOVE_LEFT") +HabitatSimActions.extend_action_space("MOVE_RIGHT") + +@registry.register_action_space_configuration(name="move-all") +class MoveOnlySpaceConfiguration(ActionSpaceConfiguration): + def get(self): + return { + HabitatSimActions.STOP: habitat_sim.ActionSpec("stop"), + HabitatSimActions.MOVE_FORWARD: habitat_sim.ActionSpec( + "move_forward", + habitat_sim.ActuationSpec( + amount=self.config.FORWARD_STEP_SIZE + ), + ), + HabitatSimActions.MOVE_BACKWARD: habitat_sim.ActionSpec( + "move_backward", + habitat_sim.ActuationSpec( + amount=self.config.FORWARD_STEP_SIZE + ), + ), + HabitatSimActions.MOVE_RIGHT: habitat_sim.ActionSpec( + "move_right", + habitat_sim.ActuationSpec( + amount=self.config.FORWARD_STEP_SIZE + ), + ), + HabitatSimActions.MOVE_LEFT: habitat_sim.ActionSpec( + "move_left", + habitat_sim.ActuationSpec( + amount=self.config.FORWARD_STEP_SIZE + ), + ) + } + + +@attr.s(auto_attribs=True, slots=True) +class QueryActuationSpec(ActuationSpec): + # what should be the initial parameter?? 
+ amount: float = 1.0 + +@registry.register_action_space_configuration(name="SoundspacesDialogActions-v0") +class SoundspacesDialogSimV0ActionSpaceConfiguration( + HabitatSimV0ActionSpaceConfiguration +): + def __init__(self, config): + super().__init__(config) + if not HabitatSimActions.has_action("QUERY"): + HabitatSimActions.extend_action_space("QUERY") + + def get(self): + config = super().get() + new_config = { + HabitatSimActions.QUERY: habitat_sim.ActionSpec( + "dialog_based_navigation", + QueryActuationSpec( + amount=self.config.QUERY_STEP + ), + ) + } + config.update(new_config) + + return config + +@registry.register_task_action +class QueryAction(SimulatorTaskAction): + def step(self, *args: Any, **kwargs: Any): + r"""This method is called from ``Env`` on each ``step``.""" + return self._sim.step(HabitatSimActions.QUERY) diff --git a/soundspaces/tasks/audionav_task.py b/soundspaces/tasks/audionav_task.py new file mode 100644 index 0000000..1f3b1e7 --- /dev/null +++ b/soundspaces/tasks/audionav_task.py @@ -0,0 +1,45 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, Type, Union + +from habitat.config import Config +from habitat.core.dataset import Episode +from habitat.tasks.nav.nav import NavigationTask, Measure, EmbodiedTask +from habitat.core.registry import registry + + +@registry.register_task(name="AudioNav") +class AudioNavigationTask(NavigationTask): + def overwrite_sim_config( + self, sim_config: Any, episode: Type[Episode] + ) -> Any: + return merge_sim_episode_config(sim_config, episode) + + +def merge_sim_episode_config( + sim_config: Config, episode: Type[Episode] +) -> Any: + sim_config.defrost() + # here's where the scene update happens, extract the scene name out of the path + sim_config.SCENE = episode.scene_id + sim_config.freeze() + if ( + episode.start_position is not None + and episode.start_rotation is not None + ): + agent_name = sim_config.AGENTS[sim_config.DEFAULT_AGENT_ID] + agent_cfg = getattr(sim_config, agent_name) + agent_cfg.defrost() + agent_cfg.START_POSITION = episode.start_position + agent_cfg.START_ROTATION = episode.start_rotation + agent_cfg.GOAL_POSITION = episode.goals[0].position + agent_cfg.SOUND_ID = episode.info['sound'] + '.wav' + agent_cfg.IS_SET_START_STATE = True + agent_cfg.freeze() + return sim_config diff --git a/soundspaces/tasks/nav.py b/soundspaces/tasks/nav.py new file mode 100644 index 0000000..af4f3bf --- /dev/null +++ b/soundspaces/tasks/nav.py @@ -0,0 +1,847 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
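As a quick aside before the sensor definitions below: the action-space configurations registered in `action_space.py` above map integer action ids to `habitat_sim.ActionSpec`s. A minimal sketch of exercising `MoveOnlySpaceConfiguration` outside the simulator, assuming only a stub config object exposing the one field the class reads (the 0.25 m step size is an illustrative value, not taken from this repo's configs):

```python
# Standalone sketch; the stub config and its step size are assumptions for illustration.
from types import SimpleNamespace

from soundspaces.tasks.action_space import MoveOnlySpaceConfiguration

stub_config = SimpleNamespace(FORWARD_STEP_SIZE=0.25)  # hypothetical value
action_space = MoveOnlySpaceConfiguration(stub_config).get()
for action_id, spec in action_space.items():
    # STOP carries no actuation amount; the four move actions reuse FORWARD_STEP_SIZE
    amount = getattr(spec.actuation, "amount", None)
    print(action_id, spec.name, amount)
```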
+ +from typing import Any, Type, Union +import logging + +import numpy as np +import torch +import cv2 +import librosa +from gym import spaces +from skimage.measure import block_reduce + +from habitat.config import Config +from habitat.core.dataset import Episode + +from habitat.tasks.nav.nav import DistanceToGoal, Measure, EmbodiedTask, Success +from habitat.core.registry import registry +from habitat.core.simulator import ( + Sensor, + SensorTypes, + Simulator, +) +from habitat.utils.geometry_utils import ( + quaternion_from_coeff, + quaternion_rotate_vector, +) +from habitat.tasks.utils import cartesian_to_polar +from soundspaces.mp3d_utils import CATEGORY_INDEX_MAPPING +from soundspaces.utils import convert_semantic_object_to_rgb +from soundspaces.mp3d_utils import HouseReader + + +@registry.register_sensor +class AudioGoalSensor(Sensor): + def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): + self._sim = sim + super().__init__(config=config) + + def _get_uuid(self, *args: Any, **kwargs: Any): + return "audiogoal" + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.PATH + + def _get_observation_space(self, *args: Any, **kwargs: Any): + sensor_shape = (2, self._sim.config.AUDIO.RIR_SAMPLING_RATE) + + return spaces.Box( + low=np.finfo(np.float32).min, + high=np.finfo(np.float32).max, + shape=sensor_shape, + dtype=np.float32, + ) + + def get_observation(self, *args: Any, observations, episode: Episode, **kwargs: Any): + return self._sim.get_current_audiogoal_observation() + + +@registry.register_sensor +class SpectrogramSensor(Sensor): + cls_uuid: str = "spectrogram" + def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): + self._sim = sim + super().__init__(config=config) + + def _get_uuid(self, *args: Any, **kwargs: Any): + return "spectrogram" + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.PATH + + def _get_observation_space(self, *args: Any, **kwargs: Any): + spectrogram = self.compute_spectrogram(np.ones((2, self._sim.config.AUDIO.RIR_SAMPLING_RATE))) + + return spaces.Box( + low=np.finfo(np.float32).min, + high=np.finfo(np.float32).max, + shape=spectrogram.shape, + dtype=np.float32, + ) + + @staticmethod + def compute_spectrogram(audio_data): + def compute_stft(signal): + n_fft = 512 + hop_length = 160 + win_length = 400 + stft = np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length, win_length=win_length)) + stft = block_reduce(stft, block_size=(4, 4), func=np.mean) + return stft + + channel1_magnitude = np.log1p(compute_stft(audio_data[0])) + channel2_magnitude = np.log1p(compute_stft(audio_data[1])) + spectrogram = np.stack([channel1_magnitude, channel2_magnitude], axis=-1) + + return spectrogram + + def get_observation(self, *args: Any, observations, episode: Episode, **kwargs: Any): + spectrogram = self._sim.get_current_spectrogram_observation(self.compute_spectrogram) + + return spectrogram + + +@registry.register_measure +class NormalizedDistanceToGoal(Measure): + r""" Distance to goal the episode ends + """ + + def __init__( + self, *args: Any, sim: Simulator, config: Config, **kwargs: Any + ): + self._start_end_episode_distance = None + self._sim = sim + self._config = config + + super().__init__() + + def _get_uuid(self, *args: Any, **kwargs: Any): + return "normalized_distance_to_goal" + + def reset_metric(self, *args: Any, episode, **kwargs: Any): + self._start_end_episode_distance = episode.info["geodesic_distance"] + self._metric = None + + def 
update_metric(
+        self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any
+    ):
+        distance_to_goal = task.measurements.measures[DistanceToGoal.cls_uuid].get_metric()
+        self._metric = distance_to_goal / self._start_end_episode_distance
+
+
+@registry.register_sensor(name="Collision")
+class Collision(Sensor):
+    def __init__(
+        self, sim: Union[Simulator, Config], config: Config, *args: Any, **kwargs: Any
+    ):
+        super().__init__(config=config)
+        self._sim = sim
+
+    def _get_uuid(self, *args: Any, **kwargs: Any):
+        return "collision"
+
+    def _get_sensor_type(self, *args: Any, **kwargs: Any):
+        return SensorTypes.COLOR
+
+    def _get_observation_space(self, *args: Any, **kwargs: Any):
+        return spaces.Box(
+            low=0,
+            high=1,
+            shape=(1,),
+            dtype=bool
+        )
+
+    def get_observation(
+        self, *args: Any, observations, episode: Episode, **kwargs: Any
+    ) -> object:
+        return [self._sim.previous_step_collided]
+
+
+@registry.register_measure
+class SNA(Measure):
+    r"""SNA (Success weighted by Number of Actions)
+
+    ref: On Evaluation of Embodied Agents - Anderson et al.
+    https://arxiv.org/pdf/1807.06757.pdf
+    """
+
+    def __init__(
+        self, *args: Any, sim: Simulator, config: Config, **kwargs: Any
+    ):
+        self._start_end_num_action = None
+        self._agent_num_action = None
+        self._sim = sim
+        self._config = config
+
+        super().__init__()
+
+    def _get_uuid(self, *args: Any, **kwargs: Any):
+        return "sna"
+
+    def reset_metric(self, *args: Any, episode, **kwargs: Any):
+        self._start_end_num_action = episode.info["num_action"]
+        self._agent_num_action = 0
+        self._metric = None
+
+    def update_metric(
+        self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any
+    ):
+        ep_success = task.measurements.measures[Success.cls_uuid].get_metric()
+        self._agent_num_action += 1
+
+        self._metric = ep_success * (
+            self._start_end_num_action
+            / max(
+                self._start_end_num_action, self._agent_num_action
+            )
+        )
+
+
+@registry.register_measure
+class NA(Measure):
+    r"""NA (Number of Actions)
+
+    ref: On Evaluation of Embodied Agents - Anderson et al.
+    https://arxiv.org/pdf/1807.06757.pdf
+    """
+
+    def __init__(
+        self, *args: Any, sim: Simulator, config: Config, **kwargs: Any
+    ):
+        self._agent_num_action = None
+        self._sim = sim
+        self._config = config
+
+        super().__init__()
+
+    def _get_uuid(self, *args: Any, **kwargs: Any):
+        return "na"
+
+    def reset_metric(self, *args: Any, episode, **kwargs: Any):
+        self._agent_num_action = 0
+        self._metric = None
+
+    def update_metric(
+        self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any
+    ):
+        self._agent_num_action += 1
+        self._metric = self._agent_num_action
+
+
+@registry.register_sensor(name="EgoMap")
+class EgoMap(Sensor):
+    r"""Estimates the top-down occupancy based on current depth-map.
+
+    Args:
+        sim: reference to the simulator for calculating task observations.
+        config: contains the MAP_RESOLUTION, MAP_SIZE, HEIGHT_THRESH fields to
+            decide grid-size, extents of the projection, and the thresholds
+            for determining obstacles and explored space.
+ """ + + def __init__( + self, sim: Union[Simulator, Config], config: Config, *args: Any, **kwargs: Any + ): + self._sim = sim + + super().__init__(config=config) + + # Map statistics + self.map_size = self.config.MAP_SIZE + self.map_res = self.config.MAP_RESOLUTION + + # Agent height for pointcloud transformation + self.sensor_height = self.config.POSITION[1] + + # Compute intrinsic matrix + hfov = float(self._sim.config.DEPTH_SENSOR.HFOV) * np.pi / 180 + self.intrinsic_matrix = np.array([[1 / np.tan(hfov / 2.), 0., 0., 0.], + [0., 1 / np.tan(hfov / 2.), 0., 0.], + [0., 0., 1, 0], + [0., 0., 0, 1]]) + self.inverse_intrinsic_matrix = np.linalg.inv(self.intrinsic_matrix) + + # Height thresholds for obstacles + self.height_thresh = self.config.HEIGHT_THRESH + + # Depth processing + self.min_depth = float(self._sim.config.DEPTH_SENSOR.MIN_DEPTH) + self.max_depth = float(self._sim.config.DEPTH_SENSOR.MAX_DEPTH) + + # Pre-compute a grid of locations for depth projection + W = self._sim.config.DEPTH_SENSOR.WIDTH + H = self._sim.config.DEPTH_SENSOR.HEIGHT + self.proj_xs, self.proj_ys = np.meshgrid( + np.linspace(-1, 1, W), + np.linspace(1, -1, H) + ) + + def _get_uuid(self, *args: Any, **kwargs: Any): + return "ego_map" + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.COLOR + + def _get_observation_space(self, *args: Any, **kwargs: Any): + sensor_shape = (self.config.MAP_SIZE, self.config.MAP_SIZE, 2) + return spaces.Box( + low=0, + high=1, + shape=sensor_shape, + dtype=np.uint8, + ) + + def convert_to_pointcloud(self, depth): + """ + Inputs: + depth = (H, W, 1) numpy array + Returns: + xyz_camera = (N, 3) numpy array for (X, Y, Z) in egocentric world coordinates + """ + + depth_float = depth.astype(np.float32)[..., 0] + + # =========== Convert to camera coordinates ============ + W = depth.shape[1] + xs = self.proj_xs.reshape(-1) + ys = self.proj_ys.reshape(-1) + depth_float = depth_float.reshape(-1) + + # Filter out invalid depths + max_forward_range = self.map_size * self.map_res + valid_depths = (depth_float != 0.0) & (depth_float <= max_forward_range) + xs = xs[valid_depths] + ys = ys[valid_depths] + depth_float = depth_float[valid_depths] + + # Unproject + # negate depth as the camera looks along -Z + xys = np.vstack((xs * depth_float, + ys * depth_float, + -depth_float, np.ones(depth_float.shape))) + inv_K = self.inverse_intrinsic_matrix + xyz_camera = np.matmul(inv_K, xys).T # XYZ in the camera coordinate system + xyz_camera = xyz_camera[:, :3] / xyz_camera[:, 3][:, np.newaxis] + + return xyz_camera + + def safe_assign(self, im_map, x_idx, y_idx, value): + try: + im_map[x_idx, y_idx] = value + except IndexError: + valid_idx1 = np.logical_and(x_idx >= 0, x_idx < im_map.shape[0]) + valid_idx2 = np.logical_and(y_idx >= 0, y_idx < im_map.shape[1]) + valid_idx = np.logical_and(valid_idx1, valid_idx2) + im_map[x_idx[valid_idx], y_idx[valid_idx]] = value + + def _get_depth_projection(self, sim_depth): + """ + Project pixels visible in depth-map to ground-plane + """ + + if self._sim.config.DEPTH_SENSOR.NORMALIZE_DEPTH: + depth = sim_depth * (self.max_depth - self.min_depth) + self.min_depth + else: + depth = sim_depth + + XYZ_ego = self.convert_to_pointcloud(depth) + + # Adding agent's height to the point cloud + XYZ_ego[:, 1] += self.sensor_height + + # Convert to grid coordinate system + V = self.map_size + Vby2 = V // 2 + points = XYZ_ego + + grid_x = (points[:, 0] / self.map_res) + Vby2 + grid_y = (points[:, 2] / self.map_res) + V + + # Filter out invalid 
points + valid_idx = (grid_x >= 0) & (grid_x <= V-1) & (grid_y >= 0) & (grid_y <= V-1) + points = points[valid_idx, :] + grid_x = grid_x[valid_idx].astype(int) + grid_y = grid_y[valid_idx].astype(int) + + # Create empty maps for the two channels + obstacle_mat = np.zeros((self.map_size, self.map_size), np.uint8) + explore_mat = np.zeros((self.map_size, self.map_size), np.uint8) + + # Compute obstacle locations + high_filter_idx = points[:, 1] < self.height_thresh[1] + low_filter_idx = points[:, 1] > self.height_thresh[0] + obstacle_idx = np.logical_and(low_filter_idx, high_filter_idx) + + self.safe_assign(obstacle_mat, grid_y[obstacle_idx], grid_x[obstacle_idx], 1) + + # Compute explored locations + explored_idx = high_filter_idx + self.safe_assign(explore_mat, grid_y[explored_idx], grid_x[explored_idx], 1) + + # Smoothen the maps + kernel = np.ones((3, 3), np.uint8) + + obstacle_mat = cv2.morphologyEx(obstacle_mat, cv2.MORPH_CLOSE, kernel) + explore_mat = cv2.morphologyEx(explore_mat, cv2.MORPH_CLOSE, kernel) + + # Ensure all expanded regions in obstacle_mat are accounted for in explored_mat + explore_mat = np.logical_or(explore_mat, obstacle_mat) + + return np.stack([obstacle_mat, explore_mat], axis=2) + + def get_observation( + self, *args: Any, observations, episode: Episode, **kwargs: Any + ) -> object: + # convert to numpy array + ego_map_gt = self._sim.get_egomap_observation() + if ego_map_gt is None: + sim_depth = asnumpy(observations['depth']) + ego_map_gt = self._get_depth_projection(sim_depth) + self._sim.cache_egomap_observation(ego_map_gt) + + return ego_map_gt + + +def asnumpy(v): + if torch.is_tensor(v): + return v.cpu().numpy() + elif isinstance(v, np.ndarray): + return v + else: + raise ValueError('Invalid input') + + +@registry.register_sensor(name="Category") +class Category(Sensor): + cls_uuid: str = "category" + + def __init__( + self, sim: Union[Simulator, Config], config: Config, *args: Any, **kwargs: Any + ): + super().__init__(config=config) + self._sim = sim + + def _get_uuid(self, *args: Any, **kwargs: Any): + return self.cls_uuid + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.COLOR + + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box( + low=0, + high=1, + shape=(len(CATEGORY_INDEX_MAPPING.keys()),), + dtype=bool + ) + + def get_observation( + self, *args: Any, observations, episode: Episode, **kwargs: Any + ) -> object: + index = CATEGORY_INDEX_MAPPING[episode.object_category] + onehot = np.zeros(len(CATEGORY_INDEX_MAPPING.keys())) + onehot[index] = 1 + + return onehot + + +@registry.register_sensor(name="CategoryBelief") +class CategoryBelief(Sensor): + cls_uuid: str = "category_belief" + + def __init__( + self, sim: Union[Simulator, Config], config: Config, *args: Any, **kwargs: Any + ): + super().__init__(config=config) + self._sim = sim + + def _get_uuid(self, *args: Any, **kwargs: Any): + return self.cls_uuid + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.COLOR + + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box( + low=0, + high=1, + shape=(len(CATEGORY_INDEX_MAPPING.keys()),), + dtype=bool + ) + + def get_observation( + self, *args: Any, observations, episode: Episode, **kwargs: Any + ) -> object: + belief = np.zeros(len(CATEGORY_INDEX_MAPPING.keys())) + + return belief + + +@registry.register_sensor(name="LocationBelief") +class LocationBelief(Sensor): + cls_uuid: str = "location_belief" + + def __init__( + self, sim: 
Union[Simulator, Config], config: Config, *args: Any, **kwargs: Any + ): + super().__init__(config=config) + self._sim = sim + + def _get_uuid(self, *args: Any, **kwargs: Any): + return self.cls_uuid + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.COLOR + + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box( + low=0, + high=1, + shape=(2,), + dtype=bool + ) + + def get_observation( + self, *args: Any, observations, episode: Episode, **kwargs: Any + ) -> object: + belief = np.zeros(2) + return belief + + +@registry.register_sensor(name="MPCAT40Index") +class MPCAT40Index(Sensor): + def __init__( + self, sim: Union[Simulator, Config], config: Config, *args: Any, **kwargs: Any + ): + self.config = config + self._category_mapping = { + 'chair': 3, + 'table': 5, + 'picture': 6, + 'cabinet': 7, + 'cushion': 8, + 'sofa': 10, + 'bed': 11, + 'chest_of_drawers': 13, + 'plant': 14, + 'sink': 15, + 'toilet': 18, + 'stool': 19, + 'towel': 20, + 'tv_monitor': 22, + 'shower': 23, + 'bathtub': 25, + 'counter': 26, + 'fireplace': 27, + 'gym_equipment': 33, + 'seating': 34, + 'clothes': 38 + } + super().__init__(config=config) + self._sim = sim + + def _get_uuid(self, *args: Any, **kwargs: Any): + return "mpcat40_index" + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.COLOR + + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box( + low=0, + high=1, + shape=(1,), + dtype=bool + ) + + def get_observation( + self, *args: Any, observations, episode: Episode, **kwargs: Any + ) -> object: + index = self._category_mapping[episode.object_category] + encoding = np.array([index]) + + return encoding + + +@registry.register_sensor(name="SemanticObjectSensor") +class SemanticObjectSensor(Sensor): + r"""Lists the object categories for each pixel location. + + Args: + sim: reference to the simulator for calculating task observations. 
+ """ + cls_uuid: str = "semantic_object" + + def __init__( + self, sim: Simulator, config: Config, *args: Any, **kwargs: Any + ): + self._sim = sim + self._current_episode_id = None + self.mapping = None + self._initialize_category_mappings() + + super().__init__(config=config) + + def _get_uuid(self, *args: Any, **kwargs: Any): + return self.cls_uuid + + def _initialize_category_mappings(self): + self.category_to_task_category_id = { + 'chair': 0, + 'table': 1, + 'picture': 2, + 'cabinet': 3, + 'cushion': 4, + 'sofa': 5, + 'bed': 6, + 'chest_of_drawers': 7, + 'plant': 8, + 'sink': 9, + 'toilet': 10, + 'stool': 11, + 'towel': 12, + 'tv_monitor': 13, + 'shower': 14, + 'bathtub': 15, + 'counter': 16, + 'fireplace': 17, + 'gym_equipment': 18, + 'seating': 19, + 'clothes': 20 + } + self.category_to_mp3d_category_id = { + 'chair': 3, + 'table': 5, + 'picture': 6, + 'cabinet': 7, + 'cushion': 8, + 'sofa': 10, + 'bed': 11, + 'chest_of_drawers': 13, + 'plant': 14, + 'sink': 15, + 'toilet': 18, + 'stool': 19, + 'towel': 20, + 'tv_monitor': 22, + 'shower': 23, + 'bathtub': 25, + 'counter': 26, + 'fireplace': 27, + 'gym_equipment': 33, + 'seating': 34, + 'clothes': 38 + } + self.num_task_categories = np.max( + list(self.category_to_task_category_id.values()) + ) + 1 + self.mp3d_id_to_task_id = np.ones((200, ), dtype=np.int64) * -1 + for k in self.category_to_task_category_id.keys(): + v1 = self.category_to_task_category_id[k] + v2 = self.category_to_mp3d_category_id[k] + self.mp3d_id_to_task_id[v2] = v1 + # Map unknown classes to a new category + self.mp3d_id_to_task_id[ + self.mp3d_id_to_task_id == -1 + ] = self.num_task_categories + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.COLOR + + def _get_observation_space(self, *args: Any, **kwargs: Any): + if self.config.CONVERT_TO_RGB: + observation_space = spaces.Box( + low=0, + high=255, + shape=(self.config.HEIGHT, self.config.WIDTH, 3), + dtype=np.uint8, + ) + else: + observation_space = spaces.Box( + low=np.iinfo(np.uint32).min, + high=np.iinfo(np.uint32).max, + shape=(self.config.HEIGHT, self.config.WIDTH), + dtype=np.uint32, + ) + return observation_space + + def get_observation( + self, *args: Any, observations, episode, **kwargs: Any + ): + episode_uniq_id = f"{episode.scene_id} {episode.episode_id}" + if self._current_episode_id != episode_uniq_id: + self._current_episode_id = episode_uniq_id + reader = HouseReader(self._sim._current_scene.replace('.glb', '.house')) + instance_id_to_mp3d_id = reader.compute_object_to_category_index_mapping() + self.instance_id_to_mp3d_id = np.array([instance_id_to_mp3d_id[i] for i in range(len(instance_id_to_mp3d_id))]) + + # Pre-process semantic observations to remove invalid values + semantic = np.copy(observations["semantic"]) + semantic[semantic >= self.instance_id_to_mp3d_id.shape[0]] = 0 + # Map from instance id to semantic id + semantic_object = np.take(self.instance_id_to_mp3d_id, semantic) + # Map from semantic id to task id + semantic_object = np.take(self.mp3d_id_to_task_id, semantic_object) + if self.config.CONVERT_TO_RGB: + semantic_object = SemanticObjectSensor.convert_semantic_map_to_rgb( + semantic_object + ) + + return semantic_object + + @staticmethod + def convert_semantic_map_to_rgb(semantic_map): + return convert_semantic_object_to_rgb(semantic_map) + + +@registry.register_sensor(name="PoseSensor") +class PoseSensor(Sensor): + r"""The agents current location and heading in the coordinate frame defined by the + episode, i.e. 
the axis it faces along and the origin is defined by its state at + t=0. Additionally contains the time-step of the episode. + + Args: + sim: reference to the simulator for calculating task observations. + config: Contains the DIMENSIONALITY field for the number of dimensions to express the agents position + Attributes: + _dimensionality: number of dimensions used to specify the agents position + """ + cls_uuid: str = "pose" + + def __init__( + self, sim: Simulator, config: Config, *args: Any, **kwargs: Any + ): + self._sim = sim + self._episode_time = 0 + self._current_episode_id = None + super().__init__(config=config) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.POSITION + + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box( + low=np.finfo(np.float32).min, + high=np.finfo(np.float32).max, + shape=(4,), + dtype=np.float32, + ) + + def _quat_to_xy_heading(self, quat): + direction_vector = np.array([0, 0, -1]) + + heading_vector = quaternion_rotate_vector(quat, direction_vector) + + phi = cartesian_to_polar(-heading_vector[2], heading_vector[0])[1] + return np.array([phi], dtype=np.float32) + + def get_observation( + self, observations, episode, *args: Any, **kwargs: Any + ): + episode_uniq_id = f"{episode.scene_id} {episode.episode_id}" + if episode_uniq_id != self._current_episode_id: + self._episode_time = 0.0 + self._current_episode_id = episode_uniq_id + + agent_state = self._sim.get_agent_state() + + origin = np.array(episode.start_position, dtype=np.float32) + rotation_world_start = quaternion_from_coeff(episode.start_rotation) + + agent_position_xyz = agent_state.position + rotation_world_agent = agent_state.rotation + + agent_position_xyz = quaternion_rotate_vector( + rotation_world_start.inverse(), agent_position_xyz - origin + ) + + agent_heading = self._quat_to_xy_heading( + rotation_world_agent.inverse() * rotation_world_start + ) + + ep_time = self._episode_time + self._episode_time += 1.0 + + return np.array( + [-agent_position_xyz[2], agent_position_xyz[0], agent_heading, ep_time], + dtype=np.float32 + ) + + +@registry.register_sensor +class ProximitySensor(Sensor): + r"""Sensor for observing the distance to the closest obstacle + + Args: + sim: reference to the simulator for calculating task observations. + config: config for the sensor. 
+ """ + cls_uuid: str = "proximity" + + def __init__(self, sim, config, *args: Any, **kwargs: Any): + self._sim = sim + self._max_detection_radius = getattr( + config, "MAX_DETECTION_RADIUS", 2.0 + ) + super().__init__(config=config) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.TACTILE + + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box( + low=0.0, + high=self._max_detection_radius, + shape=(1,), + dtype=np.float32, + ) + + def get_observation( + self, observations, *args: Any, episode, **kwargs: Any + ): + current_position = self._sim.get_agent_state().position + + return np.array( + [ + self._sim.distance_to_closest_obstacle( + current_position, self._max_detection_radius + ) + ], + dtype=np.float32, + ) + + +@registry.register_sensor +class OracleActionSensor(Sensor): + def __init__(self, *args: Any, sim: Simulator, config: Config, **kwargs: Any): + self._sim = sim + super().__init__(config=config) + + def _get_uuid(self, *args: Any, **kwargs: Any): + return "oracle_action_sensor" + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.PATH + + def _get_observation_space(self, *args: Any, **kwargs: Any): + sensor_shape = (1,) + + return spaces.Box( + low=np.finfo(np.float32).min, + high=np.finfo(np.float32).max, + shape=sensor_shape, + dtype=np.float32, + ) + + def get_observation(self, *args: Any, observations, episode: Episode, **kwargs: Any): + return self._sim.get_oracle_action() diff --git a/soundspaces/tasks/semantic_audiodialognav_task.py b/soundspaces/tasks/semantic_audiodialognav_task.py new file mode 100644 index 0000000..f776e09 --- /dev/null +++ b/soundspaces/tasks/semantic_audiodialognav_task.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0-or-later +# SPDX-License-Identifier: CC-BY-4.0 + +# Adapted from semantic_audionav_task.py + +# The MIT license below is in the original source at https://github.com/facebookresearch/sound-spaces/blob/main/soundspaces/tasks/semantic_audionav_task.py +# although the sound-spaces package is licensed as CC-BY-4.0 + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import os +from typing import Any, List, Optional, Type + +import attr +import numpy as np +from gym import spaces + +from habitat.config import Config +from habitat.core.dataset import Dataset, Episode +from habitat.core.logging import logger +from habitat.core.registry import registry +from habitat.core.simulator import AgentState, Sensor, SensorTypes, Simulator +from habitat.tasks.nav.nav import Measure, EmbodiedTask, Success +from habitat.core.utils import not_none_validator +from habitat.tasks.nav.nav import ( + NavigationEpisode, + NavigationGoal, + NavigationTask, +) + + +@attr.s(auto_attribs=True, kw_only=True) +class SemanticAudioGoalDialogNavEpisode(NavigationEpisode): + r"""AudioGoal-Dialog Navigation Episode + """ + object_category: str + sound_id: str + distractor_sound_id: str = None + distractor_position_index: attr.ib(converter=int) = None + offset: attr.ib(converter=int) + duration: attr.ib(converter=int) + # extra info to add for dialog + dialog_node: str = attr.ib(default=None, validator=not_none_validator) + dialog_point: List[float] = attr.ib(default=None, validator=not_none_validator) + sub_instr: str = attr.ib(default=None, validator=not_none_validator) + dialog_rotation: List[float] = attr.ib(default=None, validator=not_none_validator) + + + @property + def goals_key(self) -> str: + r"""The key to retrieve the goals + """ + return f"{os.path.basename(self.scene_id)}_{self.object_category}" + + +@attr.s(auto_attribs=True) +class ObjectViewLocation: + r"""ObjectViewLocation provides information about a position around an object goal + usually that is navigable and the object is visible with specific agent + configuration that episode's dataset was created. + that is target for + navigation. That can be specify object_id, position and object + category. An important part for metrics calculation are view points that + describe success area for the navigation. + + Args: + agent_state: navigable AgentState with a position and a rotation where + the object is visible. + iou: an intersection of a union of the object and a rectangle in the + center of view. This metric is used to evaluate how good is the object + view form current position. Higher iou means better view, iou equals + 1.0 if whole object is inside of the rectangle and no pixel inside + the rectangle belongs to anything except the object. + """ + agent_state: AgentState + iou: Optional[float] + + +@attr.s(auto_attribs=True, kw_only=True) +class SemanticAudioGoal(NavigationGoal): + r"""Object goal provides information about an object that is target for + navigation. That can be specify object_id, position and object + category. An important part for metrics calculation are view points that + describe success area for the navigation. + + Args: + object_id: id that can be used to retrieve object from the semantic + scene annotation + object_name: name of the object + object_category: object category name usually similar to scene semantic + categories + room_id: id of a room where object is located, can be used to retrieve + room from the semantic scene annotation + room_name: name of the room, where object is located + view_points: navigable positions around the object with specified + proximity of the object surface used for navigation metrics calculation. + The object is visible from these positions. 
+ """ + + object_id: str = attr.ib(default=None, validator=not_none_validator) + object_name: Optional[str] = None + object_category: Optional[str] = None + room_id: Optional[str] = None + room_name: Optional[str] = None + view_points: Optional[List[ObjectViewLocation]] = None + + +@registry.register_sensor +class SemanticAudioGoalDialogSensor(Sensor): + r"""A sensor for Object Goal specification as observations which is used in + ObjectGoal Navigation. The goal is expected to be specified by object_id or + semantic category id. + For the agent in simulator the forward direction is along negative-z. + In polar coordinate format the angle returned is azimuth to the goal. + Args: + sim: a reference to the simulator for calculating task observations. + config: a config for the ObjectGoalSensor sensor. Can contain field + GOAL_SPEC that specifies which id use for goal specification, + GOAL_SPEC_MAX_VAL the maximum object_id possible used for + observation space definition. + dataset: a Object Goal navigation ?? dataset that contains dictionaries + of categories id to text mapping. + """ + cls_uuid: str = "objectgoal" + + def __init__( + self, sim, config: Config, dataset: Dataset, *args: Any, **kwargs: Any + ): + self._sim = sim + self._dataset = dataset + super().__init__(config=config) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.SEMANTIC + + def _get_observation_space(self, *args: Any, **kwargs: Any): + sensor_shape = (1,) + max_value = (self.config.GOAL_SPEC_MAX_VAL - 1,) + if self.config.GOAL_SPEC == "TASK_CATEGORY_ID": + max_value = max( + self._dataset.category_to_task_category_id.values() + ) + + return spaces.Box( + low=0, high=max_value, shape=sensor_shape, dtype=np.int64 + ) + + def get_observation( + self, + observations, + *args: Any, + episode: SemanticAudioGoalDialogNavEpisode, + **kwargs: Any, + ) -> Optional[int]: + if self.config.GOAL_SPEC == "TASK_CATEGORY_ID": + if len(episode.goals) == 0: + logger.error( + f"No goal specified for episode {episode.episode_id}." + ) + return None + if not isinstance(episode.goals[0], SemanticAudioGoal): + logger.error( + f"First goal should be ObjectGoal, episode {episode.episode_id}." + ) + return None + category_name = episode.object_category + return np.array( + [self._dataset.category_to_task_category_id[category_name]], + dtype=np.int64, + ) + elif self.config.GOAL_SPEC == "OBJECT_ID": + return np.array([episode.goals[0].object_name_id], dtype=np.int64) + else: + raise RuntimeError( + "Wrong GOAL_SPEC specified for ObjectGoalSensor." + ) + + +@registry.register_task(name="SemanticAudioNav") +class SemanticAudioDialogNavigationTask(NavigationTask): + r"""An Object?? Navigation Task class for a task specific methods. + Used to explicitly state a type of the task in config. 
+ """ + + def overwrite_sim_config( + self, sim_config: Any, episode: Type[Episode] + ) -> Any: + return merge_sim_episode_config(sim_config, episode) + + +def merge_sim_episode_config( + sim_config: Config, episode: Type[Episode] +) -> Any: + sim_config.defrost() + # here's where the scene update happens, extract the scene name out of the path + sim_config.SCENE = episode.scene_id + sim_config.freeze() + if ( + episode.start_position is not None + and episode.start_rotation is not None + ): + agent_name = sim_config.AGENTS[sim_config.DEFAULT_AGENT_ID] + agent_cfg = getattr(sim_config, agent_name) + agent_cfg.defrost() + agent_cfg.START_POSITION = episode.start_position + agent_cfg.START_ROTATION = episode.start_rotation + agent_cfg.GOAL_POSITION = episode.goals[0].position + agent_cfg.SOUND_ID = episode.sound_id + agent_cfg.DISTRACTOR_SOUND_ID = episode.distractor_sound_id + agent_cfg.DISTRACTOR_POSITION_INDEX = episode.distractor_position_index + agent_cfg.OFFSET = episode.offset + agent_cfg.DURATION = episode.duration + agent_cfg.IS_SET_START_STATE = True + + # for dialog + agent_cfg.DIALOG_NODE = episode.dialog_node + agent_cfg.DIALOG_POINT = episode.dialog_point + agent_cfg.SUB_INSTR = episode.sub_instr + agent_cfg.DIALOG_ROTATION = episode.dialog_rotation + agent_cfg.freeze() + return sim_config + + +@registry.register_measure +class SWS(Measure): + r"""Success when silent + """ + def __init__( + self, *args: Any, sim: Simulator, config: Config, **kwargs: Any + ): + self._sim = sim + self._config = config + super().__init__() + + def _get_uuid(self, *args: Any, **kwargs: Any): + return "sws" + + def reset_metric(self, *args: Any, episode, **kwargs: Any): + self._metric = None + + def update_metric( + self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any + ): + ep_success = task.measurements.measures[Success.cls_uuid].get_metric() + self._metric = ep_success * self._sim.is_silent diff --git a/soundspaces/tasks/semantic_audionav_task.py b/soundspaces/tasks/semantic_audionav_task.py new file mode 100644 index 0000000..5f32986 --- /dev/null +++ b/soundspaces/tasks/semantic_audionav_task.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0-or-later +# SPDX-License-Identifier: CC-BY-4.0 + +# The MIT license below is in the original source at https://github.com/facebookresearch/sound-spaces/blob/main/soundspaces/tasks/semantic_audionav_task.py +# although the sound-spaces package is licensed as CC-BY-4.0 + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import os +from typing import Any, List, Optional, Type + +import attr +import numpy as np +from gym import spaces +import sys + +from habitat.config import Config +from habitat.core.dataset import Dataset, Episode +from habitat.core.logging import logger +from habitat.core.registry import registry +from habitat.core.simulator import AgentState, Sensor, SensorTypes, Simulator +from habitat.tasks.nav.nav import Measure, EmbodiedTask, Success +from habitat.core.utils import not_none_validator +from habitat.tasks.nav.nav import ( + NavigationEpisode, + NavigationGoal, + NavigationTask, +) + + +@attr.s(auto_attribs=True, kw_only=True) +class SemanticAudioGoalNavEpisode(NavigationEpisode): + r"""ObjectGoal Navigation Episode + + :param object_category: Category of the object + """ + object_category: str + sound_id: str + distractor_sound_id: str = None + distractor_position_index: attr.ib(converter=int) = None + offset: attr.ib(converter=int) + duration: attr.ib(converter=int) + # dialog pretraining + dialog_node: str = attr.ib(default=None) + sub_instr: str = attr.ib(default=None) + # direction + direction: list =attr.ib(default=None) + # rotation_angle + rotation_angle: int =attr.ib(default=None) + + + @property + def goals_key(self) -> str: + r"""The key to retrieve the goals + """ + return f"{os.path.basename(self.scene_id)}_{self.object_category}" + + +@attr.s(auto_attribs=True) +class ObjectViewLocation: + r"""ObjectViewLocation provides information about a position around an object goal + usually that is navigable and the object is visible with specific agent + configuration that episode's dataset was created. + that is target for + navigation. That can be specify object_id, position and object + category. An important part for metrics calculation are view points that + describe success area for the navigation. + + Args: + agent_state: navigable AgentState with a position and a rotation where + the object is visible. + iou: an intersection of a union of the object and a rectangle in the + center of view. This metric is used to evaluate how good is the object + view form current position. Higher iou means better view, iou equals + 1.0 if whole object is inside of the rectangle and no pixel inside + the rectangle belongs to anything except the object. + """ + agent_state: AgentState + iou: Optional[float] + + +@attr.s(auto_attribs=True, kw_only=True) +class SemanticAudioGoal(NavigationGoal): + r"""Object goal provides information about an object that is target for + navigation. That can be specify object_id, position and object + category. An important part for metrics calculation are view points that + describe success area for the navigation. + + Args: + object_id: id that can be used to retrieve object from the semantic + scene annotation + object_name: name of the object + object_category: object category name usually similar to scene semantic + categories + room_id: id of a room where object is located, can be used to retrieve + room from the semantic scene annotation + room_name: name of the room, where object is located + view_points: navigable positions around the object with specified + proximity of the object surface used for navigation metrics calculation. + The object is visible from these positions. 
+ """ + + object_id: str = attr.ib(default=None, validator=not_none_validator) + object_name: Optional[str] = None + object_category: Optional[str] = None + room_id: Optional[str] = None + room_name: Optional[str] = None + view_points: Optional[List[ObjectViewLocation]] = None + + +@registry.register_sensor +class SemanticAudioGoalSensor(Sensor): + r"""A sensor for Object Goal specification as observations which is used in + ObjectGoal Navigation. The goal is expected to be specified by object_id or + semantic category id. + For the agent in simulator the forward direction is along negative-z. + In polar coordinate format the angle returned is azimuth to the goal. + Args: + sim: a reference to the simulator for calculating task observations. + config: a config for the ObjectGoalSensor sensor. Can contain field + GOAL_SPEC that specifies which id use for goal specification, + GOAL_SPEC_MAX_VAL the maximum object_id possible used for + observation space definition. + dataset: a Object Goal navigation dataset that contains dictionaries + of categories id to text mapping. + """ + cls_uuid: str = "objectgoal" + + def __init__( + self, sim, config: Config, dataset: Dataset, *args: Any, **kwargs: Any + ): + self._sim = sim + self._dataset = dataset + super().__init__(config=config) + + def _get_uuid(self, *args: Any, **kwargs: Any) -> str: + return self.cls_uuid + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.SEMANTIC + + def _get_observation_space(self, *args: Any, **kwargs: Any): + sensor_shape = (1,) + max_value = (self.config.GOAL_SPEC_MAX_VAL - 1,) + if self.config.GOAL_SPEC == "TASK_CATEGORY_ID": + max_value = max( + self._dataset.category_to_task_category_id.values() + ) + + return spaces.Box( + low=0, high=max_value, shape=sensor_shape, dtype=np.int64 + ) + + def get_observation( + self, + observations, + *args: Any, + episode: SemanticAudioGoalNavEpisode, + **kwargs: Any, + ) -> Optional[int]: + if self.config.GOAL_SPEC == "TASK_CATEGORY_ID": + if len(episode.goals) == 0: + logger.error( + f"No goal specified for episode {episode.episode_id}." + ) + return None + if not isinstance(episode.goals[0], SemanticAudioGoal): + logger.error( + f"First goal should be ObjectGoal, episode {episode.episode_id}." + ) + return None + category_name = episode.object_category + return np.array( + [self._dataset.category_to_task_category_id[category_name]], + dtype=np.int64, + ) + elif self.config.GOAL_SPEC == "OBJECT_ID": + return np.array([episode.goals[0].object_name_id], dtype=np.int64) + else: + raise RuntimeError( + "Wrong GOAL_SPEC specified for ObjectGoalSensor." + ) + + +@registry.register_task(name="SemanticAudioNav") +class SemanticAudioNavigationTask(NavigationTask): + r"""An Object Navigation Task class for a task specific methods. + Used to explicitly state a type of the task in config. 
+    """
+
+    def overwrite_sim_config(
+        self, sim_config: Any, episode: Type[Episode]
+    ) -> Any:
+        return merge_sim_episode_config(sim_config, episode)
+
+
+def merge_sim_episode_config(
+    sim_config: Config, episode: Type[Episode]
+) -> Any:
+    sim_config.defrost()
+    # This is where the scene update happens: the episode's scene_id becomes
+    # the simulator scene.
+    sim_config.SCENE = episode.scene_id
+    sim_config.freeze()
+    if (
+        episode.start_position is not None
+        and episode.start_rotation is not None
+    ):
+        agent_name = sim_config.AGENTS[sim_config.DEFAULT_AGENT_ID]
+        agent_cfg = getattr(sim_config, agent_name)
+        agent_cfg.defrost()
+        agent_cfg.START_POSITION = episode.start_position
+        agent_cfg.START_ROTATION = episode.start_rotation
+        agent_cfg.GOAL_POSITION = episode.goals[0].position
+
+        # edit (change agent_cfg.SOUND_ID based on heard or unheard sound)
+        # agent_cfg.SOUND_ID = 'test_heard/' + episode.sound_id.split('/')[-1]
+        agent_cfg.SOUND_ID = episode.sound_id
+        agent_cfg.DISTRACTOR_SOUND_ID = episode.distractor_sound_id
+        agent_cfg.DISTRACTOR_POSITION_INDEX = episode.distractor_position_index
+        agent_cfg.OFFSET = episode.offset
+        agent_cfg.DURATION = episode.duration
+        agent_cfg.IS_SET_START_STATE = True
+
+        # for dialog pretraining
+        agent_cfg.DIALOG_NODE = episode.dialog_node
+        agent_cfg.SUB_INSTR = episode.sub_instr
+
+        agent_cfg.freeze()
+    return sim_config
+
+
+@registry.register_measure
+class SWS(Measure):
+    r"""Success when silent: the episode counts as a success only if the
+    agent succeeded while the sound source was no longer emitting.
+    """
+    def __init__(
+        self, *args: Any, sim: Simulator, config: Config, **kwargs: Any
+    ):
+        self._sim = sim
+        self._config = config
+        super().__init__()
+
+    def _get_uuid(self, *args: Any, **kwargs: Any):
+        return "sws"
+
+    def reset_metric(self, *args: Any, episode, **kwargs: Any):
+        self._metric = None
+
+    def update_metric(
+        self, *args: Any, episode, action, task: EmbodiedTask, **kwargs: Any
+    ):
+        ep_success = task.measurements.measures[Success.cls_uuid].get_metric()
+        self._metric = ep_success * self._sim.is_silent
diff --git a/soundspaces/utils.py b/soundspaces/utils.py
new file mode 100644
index 0000000..47bdf6e
--- /dev/null
+++ b/soundspaces/utils.py
@@ -0,0 +1,59 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# SPDX-License-Identifier: CC-BY-4.0
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
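+
+"""Shared SoundSpaces helpers: loading per-scene navigation metadata (points
+and connectivity graph), converting values to torch tensors, and colorizing
+semantic observations.
+
+Typical use (the scene subfolder below is an illustrative path):
+
+    points, graph = load_metadata('data/metadata/replica/room_0')
+"""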
+
+import os
+import pickle
+
+import numpy as np
+import torch
+from PIL import Image
+from habitat_sim.utils.common import d3_40_colors_rgb
+
+
+def load_metadata(parent_folder):
+    points_file = os.path.join(parent_folder, 'points.txt')
+    graph_file = os.path.join(parent_folder, 'graph.pkl')
+    points_data = np.loadtxt(points_file, delimiter="\t")
+    # Replica scenes were collected with a different sensor height offset
+    # than the other datasets.
+    height_offset = 1.5528907 if "replica" in parent_folder else 1.5
+    points = list(zip(
+        points_data[:, 1],
+        points_data[:, 3] - height_offset,
+        -points_data[:, 2])
+    )
+    if not os.path.exists(graph_file):
+        raise FileNotFoundError(graph_file + ' does not exist!')
+    with open(graph_file, 'rb') as fo:
+        graph = pickle.load(fo)
+
+    return points, graph
+
+
+def _to_tensor(v):
+    if torch.is_tensor(v):
+        return v
+    elif isinstance(v, np.ndarray):
+        return torch.from_numpy(v)
+    else:
+        return torch.tensor(v, dtype=torch.float)
+
+
+def convert_semantic_object_to_rgb(x):
+    # Map semantic ids onto a 40-color palette and render them as an RGB image.
+    semantic_img = Image.new("P", (x.shape[1], x.shape[0]))
+    semantic_img.putpalette(d3_40_colors_rgb.flatten())
+    semantic_img.putdata((x.flatten() % 40).astype(np.uint8))
+    semantic_img = np.array(semantic_img.convert("RGB"))
+    return semantic_img
diff --git a/ss_baselines/__init__.py b/ss_baselines/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ss_baselines/av_nav/README.md b/ss_baselines/av_nav/README.md
new file mode 100644
index 0000000..adce9e4
--- /dev/null
+++ b/ss_baselines/av_nav/README.md
@@ -0,0 +1,25 @@
+# Audio-Visual Navigation (AV-Nav) Model
+
+## Details
+This folder provides the code of the model as well as the training/evaluation configurations used in the
+[SoundSpaces: Audio-Visual Navigation in 3D Environments](https://arxiv.org/pdf/1912.11474.pdf) paper.
+Use of this model is the same as described in the usage section of the main README file.
+Pretrained weights are provided.
+
+## Evaluating pretrained model
+```
+python ss_baselines/av_nav/run.py --run-type eval --exp-config ss_baselines/av_nav/config/audionav/replica/test_telephone/audiogoal_depth.yaml EVAL_CKPT_PATH_DIR data/pretrained_weights/audionav/av_nav/replica/heard.pth
+python ss_baselines/av_nav/run.py --run-type eval --exp-config ss_baselines/av_nav/config/audionav/replica/test_telephone/audiogoal_depth.yaml EVAL_CKPT_PATH_DIR data/pretrained_weights/audionav/av_nav/replica/unheard.pth EVAL.SPLIT test_multiple_unheard
+```
+
+## Citation
+If you use this model in your research, please cite the following paper:
+```
+@inproceedings{chen20soundspaces,
+  title = {SoundSpaces: Audio-Visual Navigation in 3D Environments},
+  author = {Changan Chen and Unnat Jain and Carl Schissler and Sebastia Vicenc Amengual Gari and Ziad Al-Halah and Vamsi Krishna Ithapu and Philip Robinson and Kristen Grauman},
+  booktitle = {ECCV},
+  year = {2020}
+}
+```
\ No newline at end of file
diff --git a/ss_baselines/av_nav/__init__.py b/ss_baselines/av_nav/__init__.py
new file mode 100644
index 0000000..60472c0
--- /dev/null
+++ b/ss_baselines/av_nav/__init__.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# SPDX-License-Identifier: CC-BY-4.0
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
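+
+# Importing the trainer here makes it available when the package is imported.
+# Under the usual habitat-baselines pattern (an assumption here, not shown in
+# this diff), importing PPOTrainer also registers it with the baseline
+# registry so it can be selected by name from experiment configs.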
+ +from ss_baselines.av_nav.ppo.ppo_trainer import PPOTrainer diff --git a/ss_baselines/av_nav/config/__init__.py b/ss_baselines/av_nav/config/__init__.py new file mode 100644 index 0000000..c4c1360 --- /dev/null +++ b/ss_baselines/av_nav/config/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +from ss_baselines.av_nav.config.default import get_task_config, get_config diff --git a/ss_baselines/av_nav/config/audionav/mp3d/interactive_demo.yaml b/ss_baselines/av_nav/config/audionav/mp3d/interactive_demo.yaml new file mode 100644 index 0000000..d03887b --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/interactive_demo.yaml @@ -0,0 +1,45 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/interactive_demo.yaml" +TRAINER_NAME: "ppo" +ENV_NAME: "AudioNavRLEnv" +SIMULATOR_GPU_ID: 0 +TORCH_GPU_ID: 0 +TENSORBOARD_DIR: "tb" +VIDEO_DIR: "video_dir" +EVAL_CKPT_PATH_DIR: "data" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +CHECKPOINT_FOLDER: "data" +# number of times updating the ppo agent +NUM_UPDATES: 300000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 + +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiogoal_blind.yaml b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiogoal_blind.yaml new file mode 100644 index 0000000..167a139 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiogoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiogoal_depth.yaml b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiogoal_depth.yaml new file mode 100644 index 0000000..7dd63ed --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiogoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiogoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiogoal_rgb.yaml new file mode 100644 index 0000000..38c3b32 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiogoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git 
a/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiopointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiopointgoal_blind.yaml new file mode 100644 index 0000000..e1b9f71 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiopointgoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiopointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiopointgoal_depth.yaml new file mode 100644 index 0000000..ea703df --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiopointgoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiopointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiopointgoal_rgb.yaml new file mode 100644 index 0000000..d5d4de3 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/audiopointgoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/pointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/pointgoal_blind.yaml new file mode 100644 index 0000000..d4a6063 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/pointgoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/pointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/pointgoal_depth.yaml new file mode 100644 index 0000000..83da138 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/pointgoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/pointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/pointgoal_rgb.yaml new file mode 100644 index 0000000..5ecb0b5 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/test_telephone/pointgoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + 
USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiogoal_blind.yaml b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiogoal_blind.yaml new file mode 100644 index 0000000..7cc17d0 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiogoal_blind.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiogoal.yaml" +NUM_PROCESSES: 10 +SENSORS: [] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiogoal_depth.yaml b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiogoal_depth.yaml new file mode 100644 index 0000000..96fd242 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiogoal_depth.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiogoal.yaml" +NUM_PROCESSES: 10 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiogoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiogoal_rgb.yaml new file mode 100644 index 0000000..531476a --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiogoal_rgb.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiogoal.yaml" +NUM_PROCESSES: 10 +SENSORS: ["RGB_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiopointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiopointgoal_blind.yaml new file mode 100644 index 0000000..9082b33 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiopointgoal_blind.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiopointgoal.yaml" +NUM_PROCESSES: 10 +SENSORS: [] +NUM_UPDATES: 40000 +LOG_INTERVAL: 
10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiopointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiopointgoal_depth.yaml new file mode 100644 index 0000000..88459a1 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiopointgoal_depth.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiopointgoal.yaml" +NUM_PROCESSES: 10 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiopointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiopointgoal_rgb.yaml new file mode 100644 index 0000000..bc5a111 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/audiopointgoal_rgb.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiopointgoal.yaml" +NUM_PROCESSES: 10 +SENSORS: ["RGB_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_blind.yaml new file mode 100644 index 0000000..a73a594 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_blind.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/pointgoal.yaml" +NUM_PROCESSES: 10 +SENSORS: [] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + 
reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_depth.yaml new file mode 100644 index 0000000..db21c19 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_depth.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/pointgoal.yaml" +NUM_PROCESSES: 10 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_rgb.yaml new file mode 100644 index 0000000..87e444f --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_rgb.yaml @@ -0,0 +1,29 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/pointgoal.yaml" +NUM_PROCESSES: 10 +SENSORS: ["RGB_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_rgb_question.yaml b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_rgb_question.yaml new file mode 100644 index 0000000..634b0e0 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/train_telephone/pointgoal_rgb_question.yaml @@ -0,0 +1,29 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/pointgoal_question.yaml" +NUM_PROCESSES: 10 +SENSORS: ["RGB_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 diff --git a/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiogoal_blind.yaml b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiogoal_blind.yaml new file mode 100644 index 0000000..3a5d037 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiogoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: 
"val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiogoal_depth.yaml b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiogoal_depth.yaml new file mode 100644 index 0000000..665d546 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiogoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiogoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiogoal_rgb.yaml new file mode 100644 index 0000000..f4758b2 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiogoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiopointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiopointgoal_blind.yaml new file mode 100644 index 0000000..fe81af6 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiopointgoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiopointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiopointgoal_depth.yaml new file mode 100644 index 0000000..312377a --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiopointgoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiopointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiopointgoal_rgb.yaml new file mode 100644 index 0000000..7df0c6c --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/audiopointgoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/pointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/pointgoal_blind.yaml new file mode 100644 index 0000000..8a0e622 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/pointgoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] 
+VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/pointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/pointgoal_depth.yaml new file mode 100644 index 0000000..28f56f9 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/pointgoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/pointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/pointgoal_rgb.yaml new file mode 100644 index 0000000..2002a0b --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/mp3d/val_telephone/pointgoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/mp3d/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/interactive_demo.yaml b/ss_baselines/av_nav/config/audionav/replica/interactive_demo.yaml new file mode 100644 index 0000000..b95d04a --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/interactive_demo.yaml @@ -0,0 +1,45 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/interactive_demo.yaml" +TRAINER_NAME: "ppo" +ENV_NAME: "AudioNavRLEnv" +SIMULATOR_GPU_ID: 0 +TORCH_GPU_ID: 0 +TENSORBOARD_DIR: "tb" +VIDEO_DIR: "video_dir" +EVAL_CKPT_PATH_DIR: "data" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +CHECKPOINT_FOLDER: "data" +# number of times updating the ppo agent +NUM_UPDATES: 300000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 + +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiogoal_blind.yaml b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiogoal_blind.yaml new file mode 100644 index 0000000..8476e0f --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiogoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiogoal_depth.yaml b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiogoal_depth.yaml new file mode 100644 index 0000000..e215bcd --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiogoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: 
"configs/audionav/av_nav/replica/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiogoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiogoal_rgb.yaml new file mode 100644 index 0000000..6923abf --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiogoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiopointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiopointgoal_blind.yaml new file mode 100644 index 0000000..76f75e1 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiopointgoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiopointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiopointgoal_depth.yaml new file mode 100644 index 0000000..906d9c9 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiopointgoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiopointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiopointgoal_rgb.yaml new file mode 100644 index 0000000..a54fa37 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/test_telephone/audiopointgoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/test_telephone/pointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/replica/test_telephone/pointgoal_blind.yaml new file mode 100644 index 0000000..936a073 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/test_telephone/pointgoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/test_telephone/pointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/replica/test_telephone/pointgoal_depth.yaml new file mode 100644 index 0000000..d52d198 --- /dev/null 
+++ b/ss_baselines/av_nav/config/audionav/replica/test_telephone/pointgoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/test_telephone/pointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/replica/test_telephone/pointgoal_rgb.yaml new file mode 100644 index 0000000..30b1be5 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/test_telephone/pointgoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 1000 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiogoal_blind.yaml b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiogoal_blind.yaml new file mode 100644 index 0000000..587ce5d --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiogoal_blind.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiogoal.yaml" +NUM_PROCESSES: 5 +SENSORS: [] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiogoal_depth.yaml b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiogoal_depth.yaml new file mode 100644 index 0000000..68ecf42 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiogoal_depth.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiogoal.yaml" +NUM_PROCESSES: 5 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiogoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiogoal_rgb.yaml new file mode 100644 index 0000000..21a51e1 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiogoal_rgb.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiogoal.yaml" +NUM_PROCESSES: 5 +SENSORS: ["RGB_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] 
+VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiopointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiopointgoal_blind.yaml new file mode 100644 index 0000000..f327f07 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiopointgoal_blind.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiopointgoal.yaml" +NUM_PROCESSES: 5 +SENSORS: [] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiopointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiopointgoal_depth.yaml new file mode 100644 index 0000000..61bfc35 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiopointgoal_depth.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiopointgoal.yaml" +NUM_PROCESSES: 5 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiopointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiopointgoal_rgb.yaml new file mode 100644 index 0000000..db1855e --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/train_telephone/audiopointgoal_rgb.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiopointgoal.yaml" +NUM_PROCESSES: 5 +SENSORS: ["RGB_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + 
reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/train_telephone/pointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/replica/train_telephone/pointgoal_blind.yaml new file mode 100644 index 0000000..1954528 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/train_telephone/pointgoal_blind.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/pointgoal.yaml" +NUM_PROCESSES: 5 +SENSORS: [] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/train_telephone/pointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/replica/train_telephone/pointgoal_depth.yaml new file mode 100644 index 0000000..a059820 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/train_telephone/pointgoal_depth.yaml @@ -0,0 +1,30 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/pointgoal.yaml" +NUM_PROCESSES: 5 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/train_telephone/pointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/replica/train_telephone/pointgoal_rgb.yaml new file mode 100644 index 0000000..c3bb7ef --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/train_telephone/pointgoal_rgb.yaml @@ -0,0 +1,29 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/pointgoal.yaml" +NUM_PROCESSES: 5 +SENSORS: ["RGB_SENSOR"] +NUM_UPDATES: 40000 +LOG_INTERVAL: 10 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.20 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiogoal_blind.yaml b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiogoal_blind.yaml new file mode 100644 index 0000000..8a9a7e8 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiogoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: 
[] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiogoal_depth.yaml b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiogoal_depth.yaml new file mode 100644 index 0000000..9abc307 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiogoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiogoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiogoal_rgb.yaml new file mode 100644 index 0000000..9aba858 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiogoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiopointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiopointgoal_blind.yaml new file mode 100644 index 0000000..614c23a --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiopointgoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiopointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiopointgoal_depth.yaml new file mode 100644 index 0000000..5cb088d --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiopointgoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiopointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiopointgoal_rgb.yaml new file mode 100644 index 0000000..4a696f6 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/val_telephone/audiopointgoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/audiopointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/val_telephone/pointgoal_blind.yaml b/ss_baselines/av_nav/config/audionav/replica/val_telephone/pointgoal_blind.yaml new file mode 100644 index 0000000..ade4eb8 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/val_telephone/pointgoal_blind.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: 
"configs/audionav/av_nav/replica/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: [] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/val_telephone/pointgoal_depth.yaml b/ss_baselines/av_nav/config/audionav/replica/val_telephone/pointgoal_depth.yaml new file mode 100644 index 0000000..e584cff --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/val_telephone/pointgoal_depth.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/audionav/replica/val_telephone/pointgoal_rgb.yaml b/ss_baselines/av_nav/config/audionav/replica/val_telephone/pointgoal_rgb.yaml new file mode 100644 index 0000000..664d7d8 --- /dev/null +++ b/ss_baselines/av_nav/config/audionav/replica/val_telephone/pointgoal_rgb.yaml @@ -0,0 +1,10 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_nav/replica/pointgoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["RGB_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_nav/config/default.py b/ss_baselines/av_nav/config/default.py new file mode 100644 index 0000000..480e9c9 --- /dev/null +++ b/ss_baselines/av_nav/config/default.py @@ -0,0 +1,232 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import List, Optional, Union +import os +import logging +import shutil + +import numpy as np + +from habitat import get_config as get_task_config +from habitat.config import Config as CN +import habitat + +DEFAULT_CONFIG_DIR = "configs/" +CONFIG_FILE_SEPARATOR = "," +# ----------------------------------------------------------------------------- +# EXPERIMENT CONFIG +# ----------------------------------------------------------------------------- +_C = CN() +_C.SEED = 0 +_C.BASE_TASK_CONFIG_PATH = "configs/tasks/pointgoal.yaml" +_C.TASK_CONFIG = CN() # task_config will be stored as a config node +_C.CMD_TRAILING_OPTS = [] # store command line options as list of strings +_C.TRAINER_NAME = "AVNavTrainer" +_C.ENV_NAME = "AudioNavRLEnv" +_C.SIMULATOR_GPU_ID = 0 +_C.TORCH_GPU_ID = 0 +_C.VIDEO_OPTION = ["disk", "tensorboard"] +_C.VISUALIZATION_OPTION = ["top_down_map"] +_C.TENSORBOARD_DIR = "tb" +_C.VIDEO_DIR = "video_dir" +_C.TEST_EPISODE_COUNT = 2 +_C.EVAL_CKPT_PATH_DIR = "data/checkpoints" # path to ckpt or path to ckpts dir +_C.NUM_PROCESSES = 16 +_C.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR"] +_C.CHECKPOINT_FOLDER = "data/checkpoints" +_C.NUM_UPDATES = 10000 +_C.LOG_INTERVAL = 10 +_C.LOG_FILE = "train.log" +_C.CHECKPOINT_INTERVAL = 50 +_C.USE_VECENV = True +_C.USE_SYNC_VECENV = False +_C.EXTRA_RGB = False +_C.DEBUG = False +_C.USE_LAST_CKPT = False +_C.DISPLAY_RESOLUTION = 128 +# ----------------------------------------------------------------------------- +# EVAL CONFIG +# ----------------------------------------------------------------------------- +_C.EVAL = CN() +# The split to evaluate on +_C.EVAL.SPLIT = "val" +_C.EVAL.USE_CKPT_CONFIG = True +# ----------------------------------------------------------------------------- +# REINFORCEMENT LEARNING (RL) ENVIRONMENT CONFIG +# ----------------------------------------------------------------------------- +_C.RL = CN() +_C.RL.SUCCESS_REWARD = 10.0 +_C.RL.SLACK_REWARD = -0.01 +_C.RL.WITH_TIME_PENALTY = True +_C.RL.WITH_DISTANCE_REWARD = True +_C.RL.DISTANCE_REWARD_SCALE = 1.0 +_C.RL.TIME_DIFF = False +# ----------------------------------------------------------------------------- +# PROXIMAL POLICY OPTIMIZATION (PPO) +# ----------------------------------------------------------------------------- +_C.RL.PPO = CN() +_C.RL.PPO.clip_param = 0.2 +_C.RL.PPO.ppo_epoch = 4 +_C.RL.PPO.num_mini_batch = 16 +_C.RL.PPO.value_loss_coef = 0.5 +_C.RL.PPO.entropy_coef = 0.01 +_C.RL.PPO.lr = 7e-4 +_C.RL.PPO.eps = 1e-5 +_C.RL.PPO.max_grad_norm = 0.5 +_C.RL.PPO.num_steps = 5 +_C.RL.PPO.hidden_size = 512 +_C.RL.PPO.use_gae = True +_C.RL.PPO.use_linear_lr_decay = False +_C.RL.PPO.use_linear_clip_decay = False +_C.RL.PPO.gamma = 0.99 +_C.RL.PPO.tau = 0.95 +_C.RL.PPO.reward_window_size = 50 +# ----------------------------------------------------------------------------- +# TASK CONFIG +# ----------------------------------------------------------------------------- +_TC = habitat.get_config() +_TC.defrost() +# ----------------------------------------------------------------------------- +# AUDIOGOAL_SENSOR +# ----------------------------------------------------------------------------- +_TC.TASK.AUDIOGOAL_SENSOR = CN() +_TC.TASK.AUDIOGOAL_SENSOR.TYPE = "AudioGoalSensor" +# ----------------------------------------------------------------------------- +# SPECTROGRAM_SENSOR +# ----------------------------------------------------------------------------- +_TC.TASK.SPECTROGRAM_SENSOR = CN() +_TC.TASK.SPECTROGRAM_SENSOR.TYPE = "SpectrogramSensor" +# 
-----------------------------------------------------------------------------
+# soundspaces
+# -----------------------------------------------------------------------------
+_TC.SIMULATOR.GRID_SIZE = 0.5
+_TC.SIMULATOR.CONTINUOUS_VIEW_CHANGE = False
+_TC.SIMULATOR.VIEW_CHANGE_FPS = 10
+_TC.SIMULATOR.SCENE_DATASET = 'replica'
+_TC.SIMULATOR.USE_RENDERED_OBSERVATIONS = True
+_TC.SIMULATOR.SCENE_OBSERVATION_DIR = 'data/scene_observations'
+_TC.SIMULATOR.AUDIO = CN()
+_TC.SIMULATOR.AUDIO.SCENE = ""
+_TC.SIMULATOR.AUDIO.BINAURAL_RIR_DIR = "data/binaural_rirs"
+_TC.SIMULATOR.AUDIO.RIR_SAMPLING_RATE = 44100
+_TC.SIMULATOR.AUDIO.SOURCE_SOUND_DIR = "data/sounds/1s_all"
+_TC.SIMULATOR.AUDIO.METADATA_DIR = "data/metadata"
+_TC.SIMULATOR.AUDIO.POINTS_FILE = 'points.txt'
+_TC.SIMULATOR.AUDIO.GRAPH_FILE = 'graph.pkl'
+_TC.SIMULATOR.AUDIO.HAS_DISTRACTOR_SOUND = False
+_TC.SIMULATOR.AUDIO.EVERLASTING = True
+# -----------------------------------------------------------------------------
+# DistanceToGoal Measure
+# -----------------------------------------------------------------------------
+_TC.TASK.DISTANCE_TO_GOAL = CN()
+_TC.TASK.DISTANCE_TO_GOAL.TYPE = "DistanceToGoal"
+_TC.TASK.DISTANCE_TO_GOAL.DISTANCE_TO = "POINT"
+_TC.TASK.NORMALIZED_DISTANCE_TO_GOAL = CN()
+_TC.TASK.NORMALIZED_DISTANCE_TO_GOAL.TYPE = "NormalizedDistanceToGoal"
+# -----------------------------------------------------------------------------
+# Dataset extension
+# -----------------------------------------------------------------------------
+_TC.DATASET.VERSION = 'v1'
+# -----------------------------------------------------------------------------
+# NumberOfAction Measure
+# -----------------------------------------------------------------------------
+_TC.TASK.NUM_ACTION = CN()
+_TC.TASK.NUM_ACTION.TYPE = "NA"
+_TC.TASK.SUCCESS_WEIGHTED_BY_NUM_ACTION = CN()
+_TC.TASK.SUCCESS_WEIGHTED_BY_NUM_ACTION.TYPE = "SNA"
+
+
+def merge_from_path(config, config_paths):
+    if config_paths:
+        if isinstance(config_paths, str):
+            if CONFIG_FILE_SEPARATOR in config_paths:
+                config_paths = config_paths.split(CONFIG_FILE_SEPARATOR)
+            else:
+                config_paths = [config_paths]
+
+        for config_path in config_paths:
+            config.merge_from_file(config_path)
+    return config
+
+
+def get_config(
+    config_paths: Optional[Union[List[str], str]] = None,
+    opts: Optional[list] = None,
+    model_dir: Optional[str] = None,
+    run_type: Optional[str] = None,
+    overwrite: bool = False
+) -> CN:
+    r"""Create a unified config with default values overwritten by values from
+    `config_paths` and further overridden by options from `opts`.
+
+    Args:
+        config_paths: List of config paths or string that contains comma
+            separated list of config paths.
+        opts: Config options (keys, values) in a list (e.g., passed from the
+            command line into the config). For example, `opts = ['FOO.BAR',
+            0.5]`. This argument can be used for parameter sweeping or quick
+            tests.
+
+
+def get_config(
+    config_paths: Optional[Union[List[str], str]] = None,
+    opts: Optional[list] = None,
+    model_dir: Optional[str] = None,
+    run_type: Optional[str] = None,
+    overwrite: bool = False
+) -> CN:
+    r"""Create a unified config with default values overwritten by values from
+    `config_paths` and then further overwritten by options from `opts`.
+    Args:
+        config_paths: List of config paths or string that contains comma
+            separated list of config paths.
+        opts: Config options (keys, values) in a list (e.g., passed from the
+            command line into the config). For example,
+            `opts = ['FOO.BAR', 0.5]`. This argument can be used for
+            parameter sweeping or quick tests.
+        model_dir: directory under which all experiment outputs (TensorBoard
+            logs, checkpoints, videos and the train log) are stored
+        run_type: either 'train' or 'eval'
+        overwrite: if True, remove existing output directories without
+            prompting
+    """
+    config = merge_from_path(_C.clone(), config_paths)
+    config.TASK_CONFIG = get_task_config(config_paths=config.BASE_TASK_CONFIG_PATH)
+
+    if model_dir is None:
+        model_dir = 'data/models/output'
+    config.TENSORBOARD_DIR = os.path.join(model_dir, 'tb')
+    config.CHECKPOINT_FOLDER = os.path.join(model_dir, 'data')
+    config.VIDEO_DIR = os.path.join(model_dir, 'video_dir')
+    config.LOG_FILE = os.path.join(model_dir, 'train.log')
+    config.EVAL_CKPT_PATH_DIR = os.path.join(model_dir, 'data')
+
+    if opts:
+        config.CMD_TRAILING_OPTS = opts
+        config.merge_from_list(opts)
+
+    dirs = [config.VIDEO_DIR, config.TENSORBOARD_DIR, config.CHECKPOINT_FOLDER]
+    if run_type == 'train':
+        # check dirs
+        if any([os.path.exists(d) for d in dirs]):
+            for d in dirs:
+                if os.path.exists(d):
+                    print('{} exists'.format(d))
+            if overwrite or input('Output directory already exists! Overwrite the folder? (y/n) ') == 'y':
+                for d in dirs:
+                    if os.path.exists(d):
+                        shutil.rmtree(d)
+
+    config.TASK_CONFIG.defrost()
+    config.TASK_CONFIG.SIMULATOR.USE_SYNC_VECENV = config.USE_SYNC_VECENV
+    config.TASK_CONFIG.SIMULATOR.FORWARD_STEP_SIZE = config.TASK_CONFIG.SIMULATOR.GRID_SIZE
+    config.TASK_CONFIG.freeze()
+    config.freeze()
+    return config
+
+
+def get_task_config(
+    config_paths: Optional[Union[List[str], str]] = None,
+    opts: Optional[list] = None
+) -> habitat.Config:
+    config = _TC.clone()
+    if config_paths:
+        if isinstance(config_paths, str):
+            if CONFIG_FILE_SEPARATOR in config_paths:
+                config_paths = config_paths.split(CONFIG_FILE_SEPARATOR)
+            else:
+                config_paths = [config_paths]
+
+        for config_path in config_paths:
+            config.merge_from_file(config_path)
+
+    if opts:
+        config.merge_from_list(opts)
+
+    config.freeze()
+    return config
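+
+
+# Example usage (editorial sketch, not in the original file; the YAML path and
+# model_dir below are hypothetical):
+#
+#     config = get_config(
+#         config_paths="path/to/experiment.yaml",
+#         opts=["SEED", "1"],
+#         model_dir="data/models/demo",
+#         run_type="train",
+#     )
+#
+# All outputs then land under model_dir: tb/, data/ (checkpoints), video_dir/
+# and train.log.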
diff --git a/ss_baselines/av_nav/models/__init__.py b/ss_baselines/av_nav/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ss_baselines/av_nav/models/audio_cnn.py b/ss_baselines/av_nav/models/audio_cnn.py
new file mode 100644
index 0000000..d9868d3
--- /dev/null
+++ b/ss_baselines/av_nav/models/audio_cnn.py
@@ -0,0 +1,89 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from ss_baselines.common.utils import Flatten
+from ss_baselines.av_nav.models.visual_cnn import conv_output_dim, layer_init
+
+
+class AudioCNN(nn.Module):
+    r"""A Simple 3-Conv CNN for processing audio spectrogram features
+
+    Args:
+        observation_space: The observation_space of the agent
+        output_size: The size of the embedding vector
+        audiogoal_sensor: observation key of the audio input to encode
+    """
+
+    def __init__(self, observation_space, output_size, audiogoal_sensor):
+        super(AudioCNN, self).__init__()
+        self._n_input_audio = observation_space.spaces[audiogoal_sensor].shape[2]
+        self._audiogoal_sensor = audiogoal_sensor
+
+        cnn_dims = np.array(
+            observation_space.spaces[audiogoal_sensor].shape[:2], dtype=np.float32
+        )
+
+        if cnn_dims[0] < 30 or cnn_dims[1] < 30:
+            self._cnn_layers_kernel_size = [(5, 5), (3, 3), (3, 3)]
+            self._cnn_layers_stride = [(2, 2), (2, 2), (1, 1)]
+        else:
+            self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)]
+            self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)]
+
+        for kernel_size, stride in zip(
+            self._cnn_layers_kernel_size, self._cnn_layers_stride
+        ):
+            cnn_dims = conv_output_dim(
+                dimension=cnn_dims,
+                padding=np.array([0, 0], dtype=np.float32),
+                dilation=np.array([1, 1], dtype=np.float32),
+                kernel_size=np.array(kernel_size, dtype=np.float32),
+                stride=np.array(stride, dtype=np.float32),
+            )
+
+        self.cnn = nn.Sequential(
+            nn.Conv2d(
+                in_channels=self._n_input_audio,
+                out_channels=32,
+                kernel_size=self._cnn_layers_kernel_size[0],
+                stride=self._cnn_layers_stride[0],
+            ),
+            nn.ReLU(True),
+            nn.Conv2d(
+                in_channels=32,
+                out_channels=64,
+                kernel_size=self._cnn_layers_kernel_size[1],
+                stride=self._cnn_layers_stride[1],
+            ),
+            nn.ReLU(True),
+            nn.Conv2d(
+                in_channels=64,
+                out_channels=64,
+                kernel_size=self._cnn_layers_kernel_size[2],
+                stride=self._cnn_layers_stride[2],
+            ),
+            #  nn.ReLU(True),
+            Flatten(),
+            nn.Linear(64 * cnn_dims[0] * cnn_dims[1], output_size),
+            nn.ReLU(True),
+        )
+
+        layer_init(self.cnn)
+
+    def forward(self, observations):
+        cnn_input = []
+
+        audio_observations = observations[self._audiogoal_sensor]
+        # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH]
+        audio_observations = audio_observations.permute(0, 3, 1, 2)
+        cnn_input.append(audio_observations)
+
+        cnn_input = torch.cat(cnn_input, dim=1)
+
+        return self.cnn(cnn_input)
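+
+
+if __name__ == "__main__":
+    # Editorial sketch (not part of the original file): a quick shape check,
+    # assuming a 65x26x2 binaural spectrogram observation; the exact shape
+    # depends on the simulator's STFT settings.
+    from gym import spaces
+
+    obs_space = spaces.Dict({
+        "spectrogram": spaces.Box(
+            low=-np.inf, high=np.inf, shape=(65, 26, 2), dtype=np.float32
+        )
+    })
+    net = AudioCNN(obs_space, output_size=512, audiogoal_sensor="spectrogram")
+    dummy = {"spectrogram": torch.zeros(4, 65, 26, 2)}
+    print(net(dummy).shape)  # expected: torch.Size([4, 512])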
diff --git a/ss_baselines/av_nav/models/rnn_state_encoder.py b/ss_baselines/av_nav/models/rnn_state_encoder.py
new file mode 100644
index 0000000..42c7958
--- /dev/null
+++ b/ss_baselines/av_nav/models/rnn_state_encoder.py
@@ -0,0 +1,149 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+import torch.nn as nn
+
+
+class RNNStateEncoder(nn.Module):
+    def __init__(
+        self,
+        input_size: int,
+        hidden_size: int,
+        num_layers: int = 1,
+        rnn_type: str = "GRU",
+    ):
+        r"""An RNN for encoding the state in RL.
+
+        Supports masking the hidden state during various timesteps in the forward pass
+
+        Args:
+            input_size: The input size of the RNN
+            hidden_size: The hidden size
+            num_layers: The number of recurrent layers
+            rnn_type: The RNN cell type. Must be GRU or LSTM
+        """
+
+        super().__init__()
+        self._num_recurrent_layers = num_layers
+        self._rnn_type = rnn_type
+
+        self.rnn = getattr(nn, rnn_type)(
+            input_size=input_size,
+            hidden_size=hidden_size,
+            num_layers=num_layers,
+        )
+
+        self.layer_init()
+
+    def layer_init(self):
+        for name, param in self.rnn.named_parameters():
+            if "weight" in name:
+                nn.init.orthogonal_(param)
+            elif "bias" in name:
+                nn.init.constant_(param, 0)
+
+    @property
+    def num_recurrent_layers(self):
+        return self._num_recurrent_layers * (
+            2 if "LSTM" in self._rnn_type else 1
+        )
+
+    def _pack_hidden(self, hidden_states):
+        if "LSTM" in self._rnn_type:
+            hidden_states = torch.cat(
+                [hidden_states[0], hidden_states[1]], dim=0
+            )
+
+        return hidden_states
+
+    def _unpack_hidden(self, hidden_states):
+        if "LSTM" in self._rnn_type:
+            hidden_states = (
+                hidden_states[0 : self._num_recurrent_layers],
+                hidden_states[self._num_recurrent_layers :],
+            )
+
+        return hidden_states
+
+    def _mask_hidden(self, hidden_states, masks):
+        if isinstance(hidden_states, tuple):
+            hidden_states = tuple(v * masks for v in hidden_states)
+        else:
+            hidden_states = masks * hidden_states
+
+        return hidden_states
+
+    def single_forward(self, x, hidden_states, masks):
+        r"""Forward for a non-sequence input
+        """
+        hidden_states = self._unpack_hidden(hidden_states)
+        x, hidden_states = self.rnn(
+            x.unsqueeze(0),
+            self._mask_hidden(hidden_states, masks.unsqueeze(0)),
+        )
+        x = x.squeeze(0)
+        hidden_states = self._pack_hidden(hidden_states)
+        return x, hidden_states
+
+    def seq_forward(self, x, hidden_states, masks):
+        r"""Forward for a sequence of length T
+
+        Args:
+            x: (T, N, -1) Tensor that has been flattened to (T * N, -1)
+            hidden_states: The starting hidden state.
+            masks: The masks to be applied to the hidden state at every
+                timestep. A (T, N) tensor flattened to (T * N)
+        """
+        # x is a (T, N, -1) tensor flattened to (T * N, -1)
+        n = hidden_states.size(1)
+        t = int(x.size(0) / n)
+
+        # unflatten
+        x = x.view(t, n, x.size(1))
+        masks = masks.view(t, n)
+
+        # steps in sequence which have zero for any agent. Assume t=0 has
+        # a zero in it.
+        has_zeros = (masks[1:] == 0.0).any(dim=-1).nonzero().squeeze().cpu()
+
+        # +1 to correct the masks[1:]
+        if has_zeros.dim() == 0:
+            has_zeros = [has_zeros.item() + 1]  # handle scalar
+        else:
+            has_zeros = (has_zeros + 1).numpy().tolist()
+
+        # add t=0 and t=T to the list
+        has_zeros = [0] + has_zeros + [t]
+
+        hidden_states = self._unpack_hidden(hidden_states)
+        outputs = []
+        for i in range(len(has_zeros) - 1):
+            # process steps that don't have any zeros in masks together
+            start_idx = has_zeros[i]
+            end_idx = has_zeros[i + 1]
+
+            rnn_scores, hidden_states = self.rnn(
+                x[start_idx:end_idx],
+                self._mask_hidden(
+                    hidden_states, masks[start_idx].view(1, -1, 1)
+                ),
+            )
+
+            outputs.append(rnn_scores)
+
+        # x is a (T, N, -1) tensor
+        x = torch.cat(outputs, dim=0)
+        x = x.view(t * n, -1)  # flatten
+
+        hidden_states = self._pack_hidden(hidden_states)
+        return x, hidden_states
+
+    def forward(self, x, hidden_states, masks):
+        if x.size(0) == hidden_states.size(1):
+            return self.single_forward(x, hidden_states, masks)
+        else:
+            return self.seq_forward(x, hidden_states, masks)
diff --git a/ss_baselines/av_nav/models/visual_cnn.py b/ss_baselines/av_nav/models/visual_cnn.py
new file mode 100644
index 0000000..7109e9d
--- /dev/null
+++ b/ss_baselines/av_nav/models/visual_cnn.py
@@ -0,0 +1,154 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+ +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import torch +import torch.nn as nn + +from ss_baselines.common.utils import Flatten + + +def conv_output_dim(dimension, padding, dilation, kernel_size, stride +): + r"""Calculates the output height and width based on the input + height and width to the convolution layer. + + ref: https://pytorch.org/docs/master/nn.html#torch.nn.Conv2d + """ + assert len(dimension) == 2 + out_dimension = [] + for i in range(len(dimension)): + out_dimension.append( + int( + np.floor( + ( + ( + dimension[i] + + 2 * padding[i] + - dilation[i] * (kernel_size[i] - 1) + - 1 + ) + / stride[i] + ) + + 1 + ) + ) + ) + return tuple(out_dimension) + + +def layer_init(cnn): + for layer in cnn: + if isinstance(layer, (nn.Conv2d, nn.Linear)): + nn.init.kaiming_normal_( + layer.weight, nn.init.calculate_gain("relu") + ) + if layer.bias is not None: + nn.init.constant_(layer.bias, val=0) + + +class VisualCNN(nn.Module): + r"""A Simple 3-Conv CNN followed by a fully connected layer + + Takes in observations and produces an embedding of the rgb and/or depth components + + Args: + observation_space: The observation_space of the agent + output_size: The size of the embedding vector + """ + + def __init__(self, observation_space, output_size, extra_rgb): + super().__init__() + if "rgb" in observation_space.spaces and not extra_rgb: + self._n_input_rgb = observation_space.spaces["rgb"].shape[2] + else: + self._n_input_rgb = 0 + + if "depth" in observation_space.spaces: + self._n_input_depth = observation_space.spaces["depth"].shape[2] + else: + self._n_input_depth = 0 + + # kernel size for different CNN layers + self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)] + + # strides for different CNN layers + self._cnn_layers_stride = [(4, 4), (2, 2), (2, 2)] + + if self._n_input_rgb > 0: + cnn_dims = np.array( + observation_space.spaces["rgb"].shape[:2], dtype=np.float32 + ) + elif self._n_input_depth > 0: + cnn_dims = np.array( + observation_space.spaces["depth"].shape[:2], dtype=np.float32 + ) + + if self.is_blind: + self.cnn = nn.Sequential() + else: + for kernel_size, stride in zip( + self._cnn_layers_kernel_size, self._cnn_layers_stride + ): + cnn_dims = conv_output_dim( + dimension=cnn_dims, + padding=np.array([0, 0], dtype=np.float32), + dilation=np.array([1, 1], dtype=np.float32), + kernel_size=np.array(kernel_size, dtype=np.float32), + stride=np.array(stride, dtype=np.float32), + ) + + self.cnn = nn.Sequential( + nn.Conv2d( + in_channels=self._n_input_rgb + self._n_input_depth, + out_channels=32, + kernel_size=self._cnn_layers_kernel_size[0], + stride=self._cnn_layers_stride[0], + ), + nn.ReLU(True), + nn.Conv2d( + in_channels=32, + out_channels=64, + kernel_size=self._cnn_layers_kernel_size[1], + stride=self._cnn_layers_stride[1], + ), + nn.ReLU(True), + nn.Conv2d( + in_channels=64, + out_channels=64, + kernel_size=self._cnn_layers_kernel_size[2], + stride=self._cnn_layers_stride[2], + ), + # nn.ReLU(True), + Flatten(), + nn.Linear(64 * cnn_dims[0] * cnn_dims[1], output_size), + nn.ReLU(True), + ) + + layer_init(self.cnn) + + @property + def is_blind(self): + return self._n_input_rgb + self._n_input_depth == 0 + + def forward(self, observations): + cnn_input = [] + if self._n_input_rgb > 0: + rgb_observations = observations["rgb"] + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + rgb_observations = rgb_observations.permute(0, 3, 1, 2) + rgb_observations = 
rgb_observations / 255.0 # normalize RGB + cnn_input.append(rgb_observations) + + if self._n_input_depth > 0: + depth_observations = observations["depth"] + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + depth_observations = depth_observations.permute(0, 3, 1, 2) + cnn_input.append(depth_observations) + + cnn_input = torch.cat(cnn_input, dim=1) + + return self.cnn(cnn_input) diff --git a/ss_baselines/av_nav/ppo/__init__.py b/ss_baselines/av_nav/ppo/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ss_baselines/av_nav/ppo/policy.py b/ss_baselines/av_nav/ppo/policy.py new file mode 100644 index 0000000..4ea270d --- /dev/null +++ b/ss_baselines/av_nav/ppo/policy.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +import abc + +import torch +import torch.nn as nn +from torchsummary import summary + +from ss_baselines.common.utils import CategoricalNet +from ss_baselines.av_nav.models.rnn_state_encoder import RNNStateEncoder +from ss_baselines.av_nav.models.visual_cnn import VisualCNN +from ss_baselines.av_nav.models.audio_cnn import AudioCNN + +DUAL_GOAL_DELIMITER = ',' + + +class Policy(nn.Module): + def __init__(self, net, dim_actions): + super().__init__() + self.net = net + self.dim_actions = dim_actions + + self.action_distribution = CategoricalNet( + self.net.output_size, self.dim_actions + ) + self.critic = CriticHead(self.net.output_size) + + def forward(self, *x): + raise NotImplementedError + + def act( + self, + observations, + rnn_hidden_states, + prev_actions, + masks, + deterministic=False, + ): + features, rnn_hidden_states = self.net( + observations, rnn_hidden_states, prev_actions, masks + ) + # print('Features: ', features.cpu().numpy()) + distribution = self.action_distribution(features) + # print('Distribution: ', distribution.logits.cpu().numpy()) + value = self.critic(features) + # print('Value: ', value.item()) + + if deterministic: + action = distribution.mode() + # print('Deterministic action: ', action.item()) + else: + action = distribution.sample() + # print('Sample action: ', action.item()) + + action_log_probs = distribution.log_probs(action) + + return value, action, action_log_probs, rnn_hidden_states + + def get_value(self, observations, rnn_hidden_states, prev_actions, masks): + features, _ = self.net( + observations, rnn_hidden_states, prev_actions, masks + ) + return self.critic(features) + + def evaluate_actions( + self, observations, rnn_hidden_states, prev_actions, masks, action + ): + features, rnn_hidden_states = self.net( + observations, rnn_hidden_states, prev_actions, masks + ) + distribution = self.action_distribution(features) + value = self.critic(features) + + action_log_probs = distribution.log_probs(action) + distribution_entropy = distribution.entropy().mean() + + return value, action_log_probs, distribution_entropy, rnn_hidden_states + + +class CriticHead(nn.Module): + def __init__(self, input_size): + super().__init__() + self.fc = nn.Linear(input_size, 1) + nn.init.orthogonal_(self.fc.weight) + nn.init.constant_(self.fc.bias, 0) + + def forward(self, x): + return self.fc(x) + + +class AudioNavBaselinePolicy(Policy): + def __init__( + self, + observation_space, + action_space, + goal_sensor_uuid, + hidden_size=512, + extra_rgb=False + ): + super().__init__( + AudioNavBaselineNet( + 
observation_space=observation_space, + hidden_size=hidden_size, + goal_sensor_uuid=goal_sensor_uuid, + extra_rgb=extra_rgb + ), + action_space.n, + ) + + +class Net(nn.Module, metaclass=abc.ABCMeta): + @abc.abstractmethod + def forward(self, observations, rnn_hidden_states, prev_actions, masks): + pass + + @property + @abc.abstractmethod + def output_size(self): + pass + + @property + @abc.abstractmethod + def num_recurrent_layers(self): + pass + + @property + @abc.abstractmethod + def is_blind(self): + pass + + +class AudioNavBaselineNet(Net): + r"""Network which passes the input image through CNN and concatenates + goal vector with CNN's output and passes that through RNN. + """ + + def __init__(self, observation_space, hidden_size, goal_sensor_uuid, extra_rgb=False): + super().__init__() + self.goal_sensor_uuid = goal_sensor_uuid + self._hidden_size = hidden_size + self._audiogoal = False + self._pointgoal = False + self._n_pointgoal = 0 + + if DUAL_GOAL_DELIMITER in self.goal_sensor_uuid: + goal1_uuid, goal2_uuid = self.goal_sensor_uuid.split(DUAL_GOAL_DELIMITER) + self._audiogoal = self._pointgoal = True + self._n_pointgoal = observation_space.spaces[goal1_uuid].shape[0] + else: + if 'pointgoal_with_gps_compass' == self.goal_sensor_uuid: + self._pointgoal = True + self._n_pointgoal = observation_space.spaces[self.goal_sensor_uuid].shape[0] + else: + self._audiogoal = True + + self.visual_encoder = VisualCNN(observation_space, hidden_size, extra_rgb) + if self._audiogoal: + if 'audiogoal' in self.goal_sensor_uuid: + audiogoal_sensor = 'audiogoal' + elif 'spectrogram' in self.goal_sensor_uuid: + audiogoal_sensor = 'spectrogram' + self.audio_encoder = AudioCNN(observation_space, hidden_size, audiogoal_sensor) + + rnn_input_size = (0 if self.is_blind else self._hidden_size) + \ + (self._n_pointgoal if self._pointgoal else 0) + (self._hidden_size if self._audiogoal else 0) + self.state_encoder = RNNStateEncoder(rnn_input_size, self._hidden_size) + + if 'rgb' in observation_space.spaces and not extra_rgb: + rgb_shape = observation_space.spaces['rgb'].shape + summary(self.visual_encoder.cnn, (rgb_shape[2], rgb_shape[0], rgb_shape[1]), device='cpu') + if 'depth' in observation_space.spaces: + depth_shape = observation_space.spaces['depth'].shape + summary(self.visual_encoder.cnn, (depth_shape[2], depth_shape[0], depth_shape[1]), device='cpu') + if self._audiogoal: + audio_shape = observation_space.spaces[audiogoal_sensor].shape + summary(self.audio_encoder.cnn, (audio_shape[2], audio_shape[0], audio_shape[1]), device='cpu') + + self.train() + + @property + def output_size(self): + return self._hidden_size + + @property + def is_blind(self): + return self.visual_encoder.is_blind + + @property + def num_recurrent_layers(self): + return self.state_encoder.num_recurrent_layers + + def forward(self, observations, rnn_hidden_states, prev_actions, masks): + x = [] + + if self._pointgoal: + x.append(observations[self.goal_sensor_uuid.split(DUAL_GOAL_DELIMITER)[0]]) + if self._audiogoal: + x.append(self.audio_encoder(observations)) + if not self.is_blind: + x.append(self.visual_encoder(observations)) + + x1 = torch.cat(x, dim=1) + x2, rnn_hidden_states1 = self.state_encoder(x1, rnn_hidden_states, masks) + + assert not torch.isnan(x2).any().item() + + return x2, rnn_hidden_states1 diff --git a/ss_baselines/av_nav/ppo/ppo.py b/ss_baselines/av_nav/ppo/ppo.py new file mode 100644 index 0000000..f5b78a0 --- /dev/null +++ b/ss_baselines/av_nav/ppo/ppo.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 + +# 
Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.optim as optim + +EPS_PPO = 1e-5 + + +class PPO(nn.Module): + def __init__( + self, + actor_critic, + clip_param, + ppo_epoch, + num_mini_batch, + value_loss_coef, + entropy_coef, + lr=None, + eps=None, + max_grad_norm=None, + use_clipped_value_loss=True, + use_normalized_advantage=True, + ): + + super().__init__() + + self.actor_critic = actor_critic + + self.clip_param = clip_param + self.ppo_epoch = ppo_epoch + self.num_mini_batch = num_mini_batch + + self.value_loss_coef = value_loss_coef + self.entropy_coef = entropy_coef + + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + self.optimizer = optim.Adam(actor_critic.parameters(), lr=lr, eps=eps) + self.device = next(actor_critic.parameters()).device + self.use_normalized_advantage = use_normalized_advantage + + def forward(self, *x): + raise NotImplementedError + + def get_advantages(self, rollouts): + advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] + if not self.use_normalized_advantage: + return advantages + + return (advantages - advantages.mean()) / (advantages.std() + EPS_PPO) + + def update(self, rollouts): + advantages = self.get_advantages(rollouts) + + value_loss_epoch = 0 + action_loss_epoch = 0 + dist_entropy_epoch = 0 + + for e in range(self.ppo_epoch): + data_generator = rollouts.recurrent_generator( + advantages, self.num_mini_batch + ) + + for sample in data_generator: + ( + obs_batch, + recurrent_hidden_states_batch, + actions_batch, + prev_actions_batch, + value_preds_batch, + return_batch, + masks_batch, + old_action_log_probs_batch, + adv_targ, + ) = sample + + # Reshape to do in a single forward pass for all steps + ( + values, + action_log_probs, + dist_entropy, + _, + ) = self.actor_critic.evaluate_actions( + obs_batch, + recurrent_hidden_states_batch, + prev_actions_batch, + masks_batch, + actions_batch, + ) + + ratio = torch.exp( + action_log_probs - old_action_log_probs_batch + ) + surr1 = ratio * adv_targ + surr2 = ( + torch.clamp( + ratio, 1.0 - self.clip_param, 1.0 + self.clip_param + ) + * adv_targ + ) + action_loss = -torch.min(surr1, surr2).mean() + + if self.use_clipped_value_loss: + value_pred_clipped = value_preds_batch + ( + values - value_preds_batch + ).clamp(-self.clip_param, self.clip_param) + value_losses = (values - return_batch).pow(2) + value_losses_clipped = ( + value_pred_clipped - return_batch + ).pow(2) + value_loss = ( + 0.5 + * torch.max(value_losses, value_losses_clipped).mean() + ) + else: + value_loss = 0.5 * (return_batch - values).pow(2).mean() + + self.optimizer.zero_grad() + total_loss = ( + value_loss * self.value_loss_coef + + action_loss + - dist_entropy * self.entropy_coef + ) + + self.before_backward(total_loss) + total_loss.backward() + self.after_backward(total_loss) + + self.before_step() + self.optimizer.step() + self.after_step() + + value_loss_epoch += value_loss.item() + action_loss_epoch += action_loss.item() + dist_entropy_epoch += dist_entropy.item() + + num_updates = self.ppo_epoch * self.num_mini_batch + + value_loss_epoch /= num_updates + action_loss_epoch /= num_updates + dist_entropy_epoch /= num_updates + + return value_loss_epoch, action_loss_epoch, dist_entropy_epoch + + def before_backward(self, loss): + pass + + def after_backward(self, loss): + pass + 
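+    # NOTE (editorial addition): before_backward/after_backward and
+    # before_step/after_step are hook points around the backward pass and the
+    # optimizer step. In this base class only before_step does real work
+    # (global gradient-norm clipping); a distributed variant could override
+    # the others, e.g. (sketch, assuming an initialized process group):
+    #
+    #     class DistributedPPO(PPO):
+    #         def after_backward(self, loss):
+    #             for p in self.actor_critic.parameters():
+    #                 if p.grad is not None:
+    #                     torch.distributed.all_reduce(p.grad.data)
+    #                     p.grad.data /= torch.distributed.get_world_size()
+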
+ def before_step(self): + nn.utils.clip_grad_norm_( + self.actor_critic.parameters(), self.max_grad_norm + ) + + def after_step(self): + pass diff --git a/ss_baselines/av_nav/ppo/ppo_trainer.py b/ss_baselines/av_nav/ppo/ppo_trainer.py new file mode 100644 index 0000000..59d49d5 --- /dev/null +++ b/ss_baselines/av_nav/ppo/ppo_trainer.py @@ -0,0 +1,676 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import os +import time +import logging +from collections import deque +from typing import Dict, List +import json +import random + +import numpy as np +import torch +from torch.optim.lr_scheduler import LambdaLR +from tqdm import tqdm +from numpy.linalg import norm + +from habitat import Config, logger +from habitat.utils.visualizations.utils import observations_to_image +from ss_baselines.common.base_trainer import BaseRLTrainer +from ss_baselines.common.baseline_registry import baseline_registry +from ss_baselines.common.env_utils import construct_envs +from ss_baselines.common.environments import get_env_class +from ss_baselines.common.rollout_storage import RolloutStorage +from ss_baselines.common.tensorboard_utils import TensorboardWriter +from ss_baselines.common.utils import ( + batch_obs, + generate_video, + linear_decay, + plot_top_down_map, + resize_observation +) +from ss_baselines.av_nav.ppo.policy import AudioNavBaselinePolicy +from ss_baselines.av_nav.ppo.ppo import PPO + + +@baseline_registry.register_trainer(name="AVNavTrainer") +class PPOTrainer(BaseRLTrainer): + r"""Trainer class for PPO algorithm + Paper: https://arxiv.org/abs/1707.06347. + """ + supported_tasks = ["Nav-v0"] + + def __init__(self, config=None): + super().__init__(config) + self.actor_critic = None + self.agent = None + self.envs = None + + def _setup_actor_critic_agent(self, ppo_cfg: Config, observation_space=None) -> None: + r"""Sets up actor critic and agent for PPO. + + Args: + ppo_cfg: config node with relevant params + + Returns: + None + """ + logger.add_filehandler(self.config.LOG_FILE) + + if observation_space is None: + observation_space = self.envs.observation_spaces[0] + self.actor_critic = AudioNavBaselinePolicy( + observation_space=observation_space, + action_space=self.envs.action_spaces[0], + hidden_size=ppo_cfg.hidden_size, + goal_sensor_uuid=self.config.TASK_CONFIG.TASK.GOAL_SENSOR_UUID, + extra_rgb=self.config.EXTRA_RGB + ) + self.actor_critic.to(self.device) + + self.agent = PPO( + actor_critic=self.actor_critic, + clip_param=ppo_cfg.clip_param, + ppo_epoch=ppo_cfg.ppo_epoch, + num_mini_batch=ppo_cfg.num_mini_batch, + value_loss_coef=ppo_cfg.value_loss_coef, + entropy_coef=ppo_cfg.entropy_coef, + lr=ppo_cfg.lr, + eps=ppo_cfg.eps, + max_grad_norm=ppo_cfg.max_grad_norm, + ) + + def save_checkpoint(self, file_name: str) -> None: + r"""Save checkpoint with specified name. + + Args: + file_name: file name for checkpoint + + Returns: + None + """ + checkpoint = { + "state_dict": self.agent.state_dict(), + "config": self.config, + } + torch.save( + checkpoint, os.path.join(self.config.CHECKPOINT_FOLDER, file_name) + ) + + def load_checkpoint(self, checkpoint_path: str, *args, **kwargs) -> Dict: + r"""Load checkpoint of specified path as a dict. 
+ + Args: + checkpoint_path: path of target checkpoint + *args: additional positional args + **kwargs: additional keyword args + + Returns: + dict containing checkpoint info + """ + return torch.load(checkpoint_path, *args, **kwargs) + + def _collect_rollout_step( + self, rollouts, current_episode_reward, current_episode_step, episode_rewards, + episode_spls, episode_counts, episode_steps + ): + pth_time = 0.0 + env_time = 0.0 + + t_sample_action = time.time() + # sample actions + with torch.no_grad(): + step_observation = { + k: v[rollouts.step] for k, v in rollouts.observations.items() + } + + ( + values, + actions, + actions_log_probs, + recurrent_hidden_states, + ) = self.actor_critic.act( + step_observation, + rollouts.recurrent_hidden_states[rollouts.step], + rollouts.prev_actions[rollouts.step], + rollouts.masks[rollouts.step], + ) + + pth_time += time.time() - t_sample_action + + t_step_env = time.time() + + outputs = self.envs.step([a[0].item() for a in actions]) + observations, rewards, dones, infos = [list(x) for x in zip(*outputs)] + logging.debug('Reward: {}'.format(rewards[0])) + + env_time += time.time() - t_step_env + + t_update_stats = time.time() + batch = batch_obs(observations) + rewards = torch.tensor(rewards, dtype=torch.float) + rewards = rewards.unsqueeze(1) + + masks = torch.tensor( + [[0.0] if done else [1.0] for done in dones], dtype=torch.float + ) + spls = torch.tensor( + [[info['spl']] for info in infos] + ) + + current_episode_reward += rewards + current_episode_step += 1 + # current_episode_reward is accumulating rewards across multiple updates, + # as long as the current episode is not finished + # the current episode reward is added to the episode rewards only if the current episode is done + # the episode count will also increase by 1 + episode_rewards += (1 - masks) * current_episode_reward + episode_spls += (1 - masks) * spls + episode_steps += (1 - masks) * current_episode_step + episode_counts += 1 - masks + current_episode_reward *= masks + current_episode_step *= masks + + rollouts.insert( + batch, + recurrent_hidden_states, + actions, + actions_log_probs, + values, + rewards, + masks, + ) + + pth_time += time.time() - t_update_stats + + return pth_time, env_time, self.envs.num_envs + + def _update_agent(self, ppo_cfg, rollouts): + t_update_model = time.time() + with torch.no_grad(): + last_observation = { + k: v[-1] for k, v in rollouts.observations.items() + } + next_value = self.actor_critic.get_value( + last_observation, + rollouts.recurrent_hidden_states[-1], + rollouts.prev_actions[-1], + rollouts.masks[-1], + ).detach() + + rollouts.compute_returns( + next_value, ppo_cfg.use_gae, ppo_cfg.gamma, ppo_cfg.tau + ) + + value_loss, action_loss, dist_entropy = self.agent.update(rollouts) + + rollouts.after_update() + + return ( + time.time() - t_update_model, + value_loss, + action_loss, + dist_entropy, + ) + + def train(self) -> None: + r"""Main method for training PPO. 
+
+        Returns:
+            None
+        """
+        logger.info(f"config: {self.config}")
+        random.seed(self.config.SEED)
+        np.random.seed(self.config.SEED)
+        torch.manual_seed(self.config.SEED)
+
+        self.envs = construct_envs(
+            self.config, get_env_class(self.config.ENV_NAME)
+        )
+
+        ppo_cfg = self.config.RL.PPO
+        self.device = (
+            torch.device("cuda", self.config.TORCH_GPU_ID)
+            if torch.cuda.is_available()
+            else torch.device("cpu")
+        )
+        if not os.path.isdir(self.config.CHECKPOINT_FOLDER):
+            os.makedirs(self.config.CHECKPOINT_FOLDER)
+        self._setup_actor_critic_agent(ppo_cfg)
+        logger.info(
+            "agent number of parameters: {}".format(
+                sum(param.numel() for param in self.agent.parameters())
+            )
+        )
+
+        rollouts = RolloutStorage(
+            ppo_cfg.num_steps,
+            self.envs.num_envs,
+            self.envs.observation_spaces[0],
+            self.envs.action_spaces[0],
+            ppo_cfg.hidden_size,
+        )
+        rollouts.to(self.device)
+
+        observations = self.envs.reset()
+        batch = batch_obs(observations)
+
+        for sensor in rollouts.observations:
+            rollouts.observations[sensor][0].copy_(batch[sensor])
+
+        # batch and observations may contain shared PyTorch CUDA
+        # tensors. We must explicitly clear them here otherwise
+        # they will be kept in memory for the entire duration of training!
+        batch = None
+        observations = None
+
+        # episode_rewards and episode_counts accumulate over the entire training course
+        episode_rewards = torch.zeros(self.envs.num_envs, 1)
+        episode_spls = torch.zeros(self.envs.num_envs, 1)
+        episode_steps = torch.zeros(self.envs.num_envs, 1)
+        episode_counts = torch.zeros(self.envs.num_envs, 1)
+        current_episode_reward = torch.zeros(self.envs.num_envs, 1)
+        current_episode_step = torch.zeros(self.envs.num_envs, 1)
+        window_episode_reward = deque(maxlen=ppo_cfg.reward_window_size)
+        window_episode_spl = deque(maxlen=ppo_cfg.reward_window_size)
+        window_episode_step = deque(maxlen=ppo_cfg.reward_window_size)
+        window_episode_counts = deque(maxlen=ppo_cfg.reward_window_size)
+
+        t_start = time.time()
+        env_time = 0
+        pth_time = 0
+        count_steps = 0
+        count_checkpoints = 0
+
+        lr_scheduler = LambdaLR(
+            optimizer=self.agent.optimizer,
+            lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES),
+        )
+
+        with TensorboardWriter(
+            self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs
+        ) as writer:
+            for update in range(self.config.NUM_UPDATES):
+                if ppo_cfg.use_linear_lr_decay:
+                    lr_scheduler.step()
+
+                if ppo_cfg.use_linear_clip_decay:
+                    self.agent.clip_param = ppo_cfg.clip_param * linear_decay(
+                        update, self.config.NUM_UPDATES
+                    )
+
+                for step in range(ppo_cfg.num_steps):
+                    delta_pth_time, delta_env_time, delta_steps = self._collect_rollout_step(
+                        rollouts,
+                        current_episode_reward,
+                        current_episode_step,
+                        episode_rewards,
+                        episode_spls,
+                        episode_counts,
+                        episode_steps
+                    )
+                    pth_time += delta_pth_time
+                    env_time += delta_env_time
+                    count_steps += delta_steps
+
+                delta_pth_time, value_loss, action_loss, dist_entropy = self._update_agent(
+                    ppo_cfg, rollouts
+                )
+                pth_time += delta_pth_time
+
+                window_episode_reward.append(episode_rewards.clone())
+                window_episode_spl.append(episode_spls.clone())
+                window_episode_step.append(episode_steps.clone())
+                window_episode_counts.append(episode_counts.clone())
+
+                losses = [value_loss, action_loss, dist_entropy]
+                stats = zip(
+                    ["count", "reward", "step", 'spl'],
+                    [window_episode_counts, window_episode_reward, window_episode_step, window_episode_spl],
+                )
+                deltas = {
+                    k: (
+                        (v[-1] - v[0]).sum().item()
+                        if len(v) > 1
+                        else v[0].sum().item()
+                    )
+                    for k, v in stats
+                }
+                deltas["count"] = max(deltas["count"], 1.0)
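+
+                # NOTE (editorial addition): each window_* deque stores the
+                # last reward_window_size snapshots of the cumulative
+                # per-process totals, so (v[-1] - v[0]) is the amount
+                # accumulated within the window; dividing by deltas["count"]
+                # below yields per-episode averages over recently finished
+                # episodes.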
+
+                # this reward is averaged over all the episodes that finished
+                # during the last window_size updates; the approximate number
+                # of env steps covered is window_size * num_steps
+                if update % 10 == 0:
+                    writer.add_scalar("Environment/Reward", deltas["reward"] / deltas["count"], count_steps)
+                    writer.add_scalar("Environment/SPL", deltas["spl"] / deltas["count"], count_steps)
+                    writer.add_scalar("Environment/Episode_length", deltas["step"] / deltas["count"], count_steps)
+                    writer.add_scalar('Policy/Value_Loss', value_loss, count_steps)
+                    writer.add_scalar('Policy/Action_Loss', action_loss, count_steps)
+                    writer.add_scalar('Policy/Entropy', dist_entropy, count_steps)
+                    writer.add_scalar('Policy/Learning_Rate', lr_scheduler.get_lr()[0], count_steps)
+
+                # log stats
+                if update > 0 and update % self.config.LOG_INTERVAL == 0:
+                    logger.info(
+                        "update: {}\tfps: {:.3f}\t".format(
+                            update, count_steps / (time.time() - t_start)
+                        )
+                    )
+
+                    logger.info(
+                        "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t"
+                        "frames: {}".format(
+                            update, env_time, pth_time, count_steps
+                        )
+                    )
+
+                    window_rewards = (
+                        window_episode_reward[-1] - window_episode_reward[0]
+                    ).sum()
+                    window_counts = (
+                        window_episode_counts[-1] - window_episode_counts[0]
+                    ).sum()
+
+                    if window_counts > 0:
+                        logger.info(
+                            "Average window size {} reward: {:3f}".format(
+                                len(window_episode_reward),
+                                (window_rewards / window_counts).item(),
+                            )
+                        )
+                    else:
+                        logger.info("No episodes finish in current window")
+
+                # checkpoint model
+                if update % self.config.CHECKPOINT_INTERVAL == 0:
+                    self.save_checkpoint(f"ckpt.{count_checkpoints}.pth")
+                    count_checkpoints += 1
+
+            self.envs.close()
+
+    def _eval_checkpoint(
+        self,
+        checkpoint_path: str,
+        writer: TensorboardWriter,
+        checkpoint_index: int = 0
+    ) -> Dict:
+        r"""Evaluates a single checkpoint.
+
+        Args:
+            checkpoint_path: path of checkpoint
+            writer: tensorboard writer object for logging to tensorboard
+            checkpoint_index: index of cur checkpoint for logging
+
+        Returns:
+            dict of aggregated evaluation metrics (mean episode reward and
+            the mean of each task measure)
+        """
+        random.seed(self.config.SEED)
+        np.random.seed(self.config.SEED)
+        torch.manual_seed(self.config.SEED)
+
+        # Map location CPU is almost always better than mapping to a CUDA device.
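+        # (Editorial addition) Loading on the CPU avoids spiking GPU memory
+        # and works even if the checkpoint was saved from a different CUDA
+        # device; the weights are later copied into the model after
+        # _setup_actor_critic_agent places it on self.device.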
+ ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu") + + if self.config.EVAL.USE_CKPT_CONFIG: + config = self._setup_eval_config(ckpt_dict["config"]) + else: + config = self.config.clone() + + ppo_cfg = config.RL.PPO + + config.defrost() + config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT + if self.config.DISPLAY_RESOLUTION != config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.WIDTH: + model_resolution = config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.WIDTH + config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.WIDTH = config.TASK_CONFIG.SIMULATOR.RGB_SENSOR.HEIGHT = \ + config.TASK_CONFIG.SIMULATOR.RGB_SENSOR.WIDTH = config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.HEIGHT = \ + self.config.DISPLAY_RESOLUTION + else: + model_resolution = self.config.DISPLAY_RESOLUTION + config.freeze() + + if len(self.config.VIDEO_OPTION) > 0: + config.defrost() + config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") + config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS") + config.freeze() + elif "top_down_map" in self.config.VISUALIZATION_OPTION: + config.defrost() + config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") + config.freeze() + + logger.info(f"env config: {config}") + self.envs = construct_envs( + config, get_env_class(config.ENV_NAME) + ) + if self.config.DISPLAY_RESOLUTION != model_resolution: + observation_space = self.envs.observation_spaces[0] + observation_space.spaces['depth'].shape = (model_resolution, model_resolution, 1) + observation_space.spaces['rgb'].shape = (model_resolution, model_resolution, 1) + else: + observation_space = self.envs.observation_spaces[0] + self._setup_actor_critic_agent(ppo_cfg, observation_space) + + self.agent.load_state_dict(ckpt_dict["state_dict"]) + self.actor_critic = self.agent.actor_critic + + self.metric_uuids = [] + # get name of performance metric, e.g. 
"spl" + for metric_name in self.config.TASK_CONFIG.TASK.MEASUREMENTS: + metric_cfg = getattr(self.config.TASK_CONFIG.TASK, metric_name) + measure_type = baseline_registry.get_measure(metric_cfg.TYPE) + assert measure_type is not None, "invalid measurement type {}".format( + metric_cfg.TYPE + ) + self.metric_uuids.append(measure_type(sim=None, task=None, config=None)._get_uuid()) + + observations = self.envs.reset() + if self.config.DISPLAY_RESOLUTION != model_resolution: + resize_observation(observations, model_resolution) + batch = batch_obs(observations, self.device) + + current_episode_reward = torch.zeros( + self.envs.num_envs, 1, device=self.device + ) + + test_recurrent_hidden_states = torch.zeros( + self.actor_critic.net.num_recurrent_layers, + self.config.NUM_PROCESSES, + ppo_cfg.hidden_size, + device=self.device, + ) + prev_actions = torch.zeros( + self.config.NUM_PROCESSES, 1, device=self.device, dtype=torch.long + ) + not_done_masks = torch.zeros( + self.config.NUM_PROCESSES, 1, device=self.device + ) + stats_episodes = dict() # dict of dicts that stores stats per episode + + rgb_frames = [ + [] for _ in range(self.config.NUM_PROCESSES) + ] # type: List[List[np.ndarray]] + audios = [ + [] for _ in range(self.config.NUM_PROCESSES) + ] + if len(self.config.VIDEO_OPTION) > 0: + os.makedirs(self.config.VIDEO_DIR, exist_ok=True) + + t = tqdm(total=self.config.TEST_EPISODE_COUNT) + while ( + len(stats_episodes) < self.config.TEST_EPISODE_COUNT + and self.envs.num_envs > 0 + ): + current_episodes = self.envs.current_episodes() + + with torch.no_grad(): + _, actions, _, test_recurrent_hidden_states = self.actor_critic.act( + batch, + test_recurrent_hidden_states, + prev_actions, + not_done_masks, + deterministic=False + ) + + prev_actions.copy_(actions) + + outputs = self.envs.step([a[0].item() for a in actions]) + + observations, rewards, dones, infos = [ + list(x) for x in zip(*outputs) + ] + for i in range(self.envs.num_envs): + if len(self.config.VIDEO_OPTION) > 0: + if config.TASK_CONFIG.SIMULATOR.CONTINUOUS_VIEW_CHANGE and 'intermediate' in observations[i]: + for observation in observations[i]['intermediate']: + frame = observations_to_image(observation, infos[i]) + rgb_frames[i].append(frame) + del observations[i]['intermediate'] + + if "rgb" not in observations[i]: + observations[i]["rgb"] = np.zeros((self.config.DISPLAY_RESOLUTION, + self.config.DISPLAY_RESOLUTION, 3)) + frame = observations_to_image(observations[i], infos[i]) + rgb_frames[i].append(frame) + audios[i].append(observations[i]['audiogoal']) + + if config.DISPLAY_RESOLUTION != model_resolution: + resize_observation(observations, model_resolution) + batch = batch_obs(observations, self.device) + + not_done_masks = torch.tensor( + [[0.0] if done else [1.0] for done in dones], + dtype=torch.float, + device=self.device, + ) + + rewards = torch.tensor( + rewards, dtype=torch.float, device=self.device + ).unsqueeze(1) + current_episode_reward += rewards + next_episodes = self.envs.current_episodes() + envs_to_pause = [] + for i in range(self.envs.num_envs): + # pause envs which runs out of episodes + if ( + next_episodes[i].scene_id, + next_episodes[i].episode_id, + ) in stats_episodes: + envs_to_pause.append(i) + + # episode ended + if not_done_masks[i].item() == 0: + episode_stats = dict() + for metric_uuid in self.metric_uuids: + episode_stats[metric_uuid] = infos[i][metric_uuid] + episode_stats["reward"] = current_episode_reward[i].item() + episode_stats['geodesic_distance'] = 
current_episodes[i].info['geodesic_distance'] + episode_stats['euclidean_distance'] = norm(np.array(current_episodes[i].goals[0].position) - + np.array(current_episodes[i].start_position)) + logging.debug(episode_stats) + current_episode_reward[i] = 0 + # use scene_id + episode_id as unique id for storing stats + stats_episodes[ + ( + current_episodes[i].scene_id, + current_episodes[i].episode_id, + ) + ] = episode_stats + t.update() + + if len(self.config.VIDEO_OPTION) > 0: + fps = self.config.TASK_CONFIG.SIMULATOR.VIEW_CHANGE_FPS \ + if self.config.TASK_CONFIG.SIMULATOR.CONTINUOUS_VIEW_CHANGE else 1 + generate_video( + video_option=self.config.VIDEO_OPTION, + video_dir=self.config.VIDEO_DIR, + images=rgb_frames[i][:-1], + scene_name=current_episodes[i].scene_id.split('/')[3], + sound=current_episodes[i].info['sound'], + sr=self.config.TASK_CONFIG.SIMULATOR.AUDIO.RIR_SAMPLING_RATE, + episode_id=current_episodes[i].episode_id, + checkpoint_idx=checkpoint_index, + metric_name='spl', + metric_value=infos[i]['spl'], + tb_writer=writer, + audios=audios[i][:-1], + fps=fps + ) + + # observations has been reset but info has not + # to be consistent, do not use the last frame + rgb_frames[i] = [] + audios[i] = [] + + if "top_down_map" in self.config.VISUALIZATION_OPTION: + top_down_map = plot_top_down_map(infos[i], + dataset=self.config.TASK_CONFIG.SIMULATOR.SCENE_DATASET) + scene = current_episodes[i].scene_id.split('/')[3] + writer.add_image('{}_{}_{}/{}'.format(config.EVAL.SPLIT, scene, current_episodes[i].episode_id, + config.BASE_TASK_CONFIG_PATH.split('/')[-1][:-5]), + top_down_map, + dataformats='WHC') + + ( + self.envs, + test_recurrent_hidden_states, + not_done_masks, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + ) = self._pause_envs( + envs_to_pause, + self.envs, + test_recurrent_hidden_states, + not_done_masks, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + ) + + aggregated_stats = dict() + for stat_key in next(iter(stats_episodes.values())).keys(): + aggregated_stats[stat_key] = sum( + [v[stat_key] for v in stats_episodes.values()] + ) + num_episodes = len(stats_episodes) + + stats_file = os.path.join(config.TENSORBOARD_DIR, '{}_stats_{}.json'.format(config.EVAL.SPLIT, config.SEED)) + new_stats_episodes = {','.join(key): value for key, value in stats_episodes.items()} + with open(stats_file, 'w') as fo: + json.dump(new_stats_episodes, fo) + + episode_reward_mean = aggregated_stats["reward"] / num_episodes + episode_metrics_mean = {} + for metric_uuid in self.metric_uuids: + episode_metrics_mean[metric_uuid] = aggregated_stats[metric_uuid] / num_episodes + + logger.info(f"Average episode reward: {episode_reward_mean:.6f}") + for metric_uuid in self.metric_uuids: + logger.info( + f"Average episode {metric_uuid}: {episode_metrics_mean[metric_uuid]:.6f}" + ) + + if not config.EVAL.SPLIT.startswith('test'): + writer.add_scalar("{}/reward".format(config.EVAL.SPLIT), episode_reward_mean, checkpoint_index) + for metric_uuid in self.metric_uuids: + writer.add_scalar(f"{config.EVAL.SPLIT}/{metric_uuid}", episode_metrics_mean[metric_uuid], + checkpoint_index) + + self.envs.close() + + result = { + 'episode_reward_mean': episode_reward_mean + } + for metric_uuid in self.metric_uuids: + result['episode_{}_mean'.format(metric_uuid)] = episode_metrics_mean[metric_uuid] + + return result diff --git a/ss_baselines/av_nav/run.py b/ss_baselines/av_nav/run.py new file mode 100644 index 0000000..3a0bbc6 --- /dev/null +++ b/ss_baselines/av_nav/run.py @@ -0,0 +1,101 
@@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import argparse
+import logging
+import os
+
+import warnings
+warnings.filterwarnings('ignore', category=FutureWarning)
+warnings.filterwarnings('ignore', category=UserWarning)
+import torch
+
+import soundspaces
+from ss_baselines.common.baseline_registry import baseline_registry
+from ss_baselines.av_nav.config.default import get_config
+from ss_baselines.av_wan.run import find_best_ckpt_idx
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--run-type",
+        choices=["train", "eval"],
+        # required=True,
+        default='train',
+        help="run type of the experiment (train or eval)",
+    )
+    parser.add_argument(
+        "--exp-config",
+        type=str,
+        # required=True,
+        default='av_nav/config/pointgoal_rgb.yaml',
+        help="path to config yaml containing info about experiment",
+    )
+    parser.add_argument(
+        "opts",
+        default=None,
+        nargs=argparse.REMAINDER,
+        help="Modify config options from command line",
+    )
+    parser.add_argument(
+        "--model-dir",
+        default=None,
+        help="directory for experiment outputs (tb, checkpoints, videos, log)",
+    )
+    parser.add_argument(
+        "--eval-interval",
+        type=int,
+        default=1,
+        help="Evaluation interval of checkpoints",
+    )
+    parser.add_argument(
+        "--overwrite",
+        default=False,
+        action='store_true',
+        help="overwrite existing output directories without prompting"
+    )
+    parser.add_argument(
+        "--eval-best",
+        default=False,
+        action='store_true',
+        help="evaluate the best checkpoint according to the tensorboard logs"
+    )
+    parser.add_argument(
+        "--prev-ckpt-ind",
+        type=int,
+        default=-1,
+        help="index of the last checkpoint that was already evaluated",
+    )
+    args = parser.parse_args()
+
+    if args.eval_best:
+        best_ckpt_idx = find_best_ckpt_idx(os.path.join(args.model_dir, 'tb'))
+        best_ckpt_path = os.path.join(args.model_dir, 'data', f'ckpt.{best_ckpt_idx}.pth')
+        print(f'Evaluating the best checkpoint: {best_ckpt_path}')
+        args.opts += ['EVAL_CKPT_PATH_DIR', best_ckpt_path]
+
+    # run exp
+    config = get_config(args.exp_config, args.opts, args.model_dir, args.run_type, args.overwrite)
+    trainer_init = baseline_registry.get_trainer(config.TRAINER_NAME)
+    assert trainer_init is not None, f"{config.TRAINER_NAME} is not supported"
+    trainer = trainer_init(config)
+    torch.set_num_threads(1)
+
+    level = logging.DEBUG if config.DEBUG else logging.INFO
+    logging.basicConfig(level=level, format='%(asctime)s, %(levelname)s: %(message)s',
+                        datefmt="%Y-%m-%d %H:%M:%S")
+
+    if args.run_type == "train":
+        trainer.train()
+    elif args.run_type == "eval":
+        trainer.eval(args.eval_interval, args.prev_ckpt_ind, config.USE_LAST_CKPT)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/ss_baselines/av_wan/README.md b/ss_baselines/av_wan/README.md
new file mode 100644
index 0000000..24a02ab
--- /dev/null
+++ b/ss_baselines/av_wan/README.md
@@ -0,0 +1,30 @@
+# Audio-Visual Waypoints (AV-WaN) Model
+
+## Details
+This folder provides the code of the model as well as the training/evaluation configurations used in the
+[Learning to Set Waypoints for Audio-Visual Navigation](https://arxiv.org/pdf/2008.09622.pdf) paper.
+Usage of this model is similar to what is described in the usage section of the main README file.
+Simply replace av_nav with av_wan in the command.
+
+Note that the numbers in the paper were initially reported on Habitat-Lab v0.1.5. Later versions of Habitat-Lab
+seed the random number generators a bit differently. The difference in performance should be within 1%.
+Pretrained weights are provided.
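+
+For example, training the acoustic-map variant on Matterport3D could look like this (editorial sketch; `data/models/av_wan_mp3d` is a hypothetical output directory, and the main README is authoritative for the exact command):
+```
+python ss_baselines/av_wan/run.py --run-type train --exp-config ss_baselines/av_wan/config/audionav/mp3d/train_with_am.yaml --model-dir data/models/av_wan_mp3d
+```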
+
+
+## Evaluating pretrained model
+```
+python ss_baselines/av_wan/run.py --run-type eval --exp-config ss_baselines/av_wan/config/audionav/replica/test_with_am.yaml EVAL_CKPT_PATH_DIR data/pretrained_weights/audionav/av_wan/replica/heard.pth
+python ss_baselines/av_wan/run.py --run-type eval --exp-config ss_baselines/av_wan/config/audionav/replica/test_with_am.yaml EVAL_CKPT_PATH_DIR data/pretrained_weights/audionav/av_wan/replica/unheard.pth EVAL.SPLIT test_multiple_unheard
+```
+
+
+## Citation
+If you use this model in your research, please cite the following paper:
+```
+@inproceedings{chen21avwan,
+    title = {Learning to Set Waypoints for Audio-Visual Navigation},
+    author = {Changan Chen and Sagnik Majumder and Ziad Al-Halah and Ruohan Gao and Santhosh K. Ramakrishnan and Kristen Grauman},
+    booktitle = {ICLR},
+    year = {2021}
+}
+```
\ No newline at end of file
diff --git a/ss_baselines/av_wan/__init__.py b/ss_baselines/av_wan/__init__.py
new file mode 100644
index 0000000..a194de4
--- /dev/null
+++ b/ss_baselines/av_wan/__init__.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from ss_baselines.av_wan.ppo.ppo_trainer import PPOTrainer
+from ss_baselines.av_wan.avwan_sensors import *
+from ss_baselines.av_wan.mapnav_env import MapNavEnv
diff --git a/ss_baselines/av_wan/avwan_sensors.py b/ss_baselines/av_wan/avwan_sensors.py
new file mode 100644
index 0000000..85aea3c
--- /dev/null
+++ b/ss_baselines/av_wan/avwan_sensors.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
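+
+# NOTE (editorial addition): the Map* sensors below are placeholders -- they
+# only declare observation-space shapes and return zero arrays; the actual
+# geometric/acoustic/action maps are expected to be computed and written into
+# the observations by the environment (see MapNavEnv). Intensity estimates the
+# received sound level as the mean squared amplitude of the first 150 samples
+# after the onset of the current audiogoal waveform.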
+ +from typing import Any, Type, Union +import logging + +import numpy as np +from gym import spaces + +from habitat.config import Config +from habitat.core.dataset import Episode +from habitat.core.registry import registry +from habitat.core.simulator import ( + Sensor, + SensorTypes, + Simulator, +) + + +class MapPlaceHolder(Sensor): + def __init__( + self, sim: Union[Simulator, Config], config: Config, *args: Any, **kwargs: Any + ): + super().__init__(config=config) + + def _get_uuid(self, *args: Any, **kwargs: Any): + raise NotImplementedError + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.COLOR + + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box( + low=0, + high=1, + shape=(self.config.MAP_SIZE, self.config.MAP_SIZE, self.config.NUM_CHANNEL), + dtype=np.uint8, + ) + + def get_observation( + self, *args: Any, observations, episode: Episode, **kwargs: Any + ) -> object: + return np.zeros((self.config.MAP_SIZE, self.config.MAP_SIZE, self.config.NUM_CHANNEL)) + + +@registry.register_sensor(name="GeometricMap") +class GeometricMap(MapPlaceHolder): + def _get_uuid(self, *args: Any, **kwargs: Any): + return "gm" + + +@registry.register_sensor(name="ActionMap") +class ActionMap(MapPlaceHolder): + def _get_uuid(self, *args: Any, **kwargs: Any): + return "action_map" + + +@registry.register_sensor(name="AcousticMap") +class AcousticMap(MapPlaceHolder): + def _get_uuid(self, *args: Any, **kwargs: Any): + return "am" + + +@registry.register_sensor(name="Intensity") +class Intensity(Sensor): + def __init__( + self, sim: Union[Simulator, Config], config: Config, *args: Any, **kwargs: Any + ): + super().__init__(config=config) + self._sim = sim + + def _get_uuid(self, *args: Any, **kwargs: Any): + return "intensity" + + def _get_sensor_type(self, *args: Any, **kwargs: Any): + return SensorTypes.COLOR + + def _get_observation_space(self, *args: Any, **kwargs: Any): + return spaces.Box( + low=0, + high=1, + shape=(1,), + dtype=bool + ) + + def get_observation( + self, *args: Any, observations, episode: Episode, **kwargs: Any + ) -> object: + num_frame = 150 + audiogoal = self._sim.get_current_audiogoal_observation() + nonzero_idx = np.min((audiogoal > 0.1 * audiogoal.max()).argmax(axis=1)) + impulse = audiogoal[:, nonzero_idx: nonzero_idx + num_frame] + rms = np.mean(impulse ** 2) + + return [rms] diff --git a/ss_baselines/av_wan/config/__init__.py b/ss_baselines/av_wan/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ss_baselines/av_wan/config/audionav/mp3d/test_with_am.yaml b/ss_baselines/av_wan/config/audionav/mp3d/test_with_am.yaml new file mode 100644 index 0000000..3ca4b4a --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/test_with_am.yaml @@ -0,0 +1,14 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +#VIDEO_OPTION: ["tensorboard"] +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: False + + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/mp3d/test_without_am.yaml b/ss_baselines/av_wan/config/audionav/mp3d/test_without_am.yaml new file mode 100644 index 0000000..d41bea9 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/test_without_am.yaml @@ -0,0 +1,14 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal_without_am.yaml" +NUM_PROCESSES: 1 +SENSORS: 
["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +#VIDEO_OPTION: ["tensorboard"] +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: False + + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/mp3d/test_without_both.yaml b/ss_baselines/av_wan/config/audionav/mp3d/test_without_both.yaml new file mode 100644 index 0000000..a8f4d26 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/test_without_both.yaml @@ -0,0 +1,14 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal_without_both.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +#VIDEO_OPTION: ["tensorboard"] +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: False + + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/mp3d/test_without_gm.yaml b/ss_baselines/av_wan/config/audionav/mp3d/test_without_gm.yaml new file mode 100644 index 0000000..b7c399f --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/test_without_gm.yaml @@ -0,0 +1,14 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal_without_gm.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +#VIDEO_OPTION: ["tensorboard"] +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: False + + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/mp3d/train_with_am.yaml b/ss_baselines/av_wan/config/audionav/mp3d/train_with_am.yaml new file mode 100644 index 0000000..03447f3 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/train_with_am.yaml @@ -0,0 +1,42 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal.yaml" +NUM_PROCESSES: 5 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 10000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False + + +RL: + SUCCESS_REWARD: 10.0 + SLACK_REWARD: -0.01 + WITH_TIME_PENALTY: True + WITH_DISTANCE_REWARD: True + DISTANCE_REWARD_SCALE: 0.25 + WITH_PREDICTION_REWARD: False + GOAL_PREDICTION_SCALE: 1.0 + + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.02 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + use_exponential_lr_decay: False + exp_decay_lambda: 5.0 + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/mp3d/train_without_am.yaml b/ss_baselines/av_wan/config/audionav/mp3d/train_without_am.yaml new file mode 100644 index 0000000..ff16916 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/train_without_am.yaml @@ -0,0 +1,42 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal_without_am.yaml" +NUM_PROCESSES: 5 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 10000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False + + +RL: + SUCCESS_REWARD: 10.0 + SLACK_REWARD: -0.01 + WITH_TIME_PENALTY: True + WITH_DISTANCE_REWARD: True + DISTANCE_REWARD_SCALE: 0.25 + WITH_PREDICTION_REWARD: False + GOAL_PREDICTION_SCALE: 1.0 + + PPO: + # ppo params + 
clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.02 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + use_exponential_lr_decay: False + exp_decay_lambda: 5.0 + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/mp3d/train_without_both.yaml b/ss_baselines/av_wan/config/audionav/mp3d/train_without_both.yaml new file mode 100644 index 0000000..6ba72d8 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/train_without_both.yaml @@ -0,0 +1,42 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal_without_both.yaml" +NUM_PROCESSES: 5 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 10000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False + + +RL: + SUCCESS_REWARD: 10.0 + SLACK_REWARD: -0.01 + WITH_TIME_PENALTY: True + WITH_DISTANCE_REWARD: True + DISTANCE_REWARD_SCALE: 0.25 + WITH_PREDICTION_REWARD: False + GOAL_PREDICTION_SCALE: 1.0 + + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.02 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + use_exponential_lr_decay: False + exp_decay_lambda: 5.0 + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/mp3d/train_without_gm.yaml b/ss_baselines/av_wan/config/audionav/mp3d/train_without_gm.yaml new file mode 100644 index 0000000..e67e7d2 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/train_without_gm.yaml @@ -0,0 +1,42 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal_without_gm.yaml" +NUM_PROCESSES: 5 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 10000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False + + +RL: + SUCCESS_REWARD: 10.0 + SLACK_REWARD: -0.01 + WITH_TIME_PENALTY: True + WITH_DISTANCE_REWARD: True + DISTANCE_REWARD_SCALE: 0.25 + WITH_PREDICTION_REWARD: False + GOAL_PREDICTION_SCALE: 1.0 + + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.02 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + use_exponential_lr_decay: False + exp_decay_lambda: 5.0 + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/mp3d/val_with_am.yaml b/ss_baselines/av_wan/config/audionav/mp3d/val_with_am.yaml new file mode 100644 index 0000000..bf1ac92 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/val_with_am.yaml @@ -0,0 +1,13 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: True + + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: 
True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/mp3d/val_without_am.yaml b/ss_baselines/av_wan/config/audionav/mp3d/val_without_am.yaml new file mode 100644 index 0000000..f3e7a9d --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/val_without_am.yaml @@ -0,0 +1,13 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal_without_am.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: True + + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/mp3d/val_without_both.yaml b/ss_baselines/av_wan/config/audionav/mp3d/val_without_both.yaml new file mode 100644 index 0000000..faf69c4 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/val_without_both.yaml @@ -0,0 +1,13 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal_without_both.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: True + + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/mp3d/val_without_gm.yaml b/ss_baselines/av_wan/config/audionav/mp3d/val_without_gm.yaml new file mode 100644 index 0000000..efafe98 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/mp3d/val_without_gm.yaml @@ -0,0 +1,13 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/mp3d/audiogoal_without_gm.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: True + + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/test_with_am.yaml b/ss_baselines/av_wan/config/audionav/replica/test_with_am.yaml new file mode 100644 index 0000000..1e80007 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/test_with_am.yaml @@ -0,0 +1,14 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +#VIDEO_OPTION: ["tensorboard"] +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: False + + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/test_without_am.yaml b/ss_baselines/av_wan/config/audionav/replica/test_without_am.yaml new file mode 100644 index 0000000..7f4ef20 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/test_without_am.yaml @@ -0,0 +1,14 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal_without_am.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +#VIDEO_OPTION: ["tensorboard"] +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: False + + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/test_without_both.yaml b/ss_baselines/av_wan/config/audionav/replica/test_without_both.yaml new file mode 100644 index 0000000..d75a263 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/test_without_both.yaml @@ -0,0 +1,14 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal_without_both.yaml" +NUM_PROCESSES: 1 
+SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +#VIDEO_OPTION: ["tensorboard"] +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: False + + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/test_without_gm.yaml b/ss_baselines/av_wan/config/audionav/replica/test_without_gm.yaml new file mode 100644 index 0000000..fb7ecc7 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/test_without_gm.yaml @@ -0,0 +1,14 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal_without_gm.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 1000 +#VIDEO_OPTION: ["tensorboard"] +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: False + + +EVAL: + SPLIT: "test_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/train_with_am.yaml b/ss_baselines/av_wan/config/audionav/replica/train_with_am.yaml new file mode 100644 index 0000000..8f0ed1a --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/train_with_am.yaml @@ -0,0 +1,42 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal.yaml" +NUM_PROCESSES: 5 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 10000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False + + +RL: + SUCCESS_REWARD: 10.0 + SLACK_REWARD: -0.01 + WITH_TIME_PENALTY: True + WITH_DISTANCE_REWARD: True + DISTANCE_REWARD_SCALE: 0.25 + WITH_PREDICTION_REWARD: False + GOAL_PREDICTION_SCALE: 1.0 + + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.02 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + use_exponential_lr_decay: False + exp_decay_lambda: 5.0 + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/train_without_am.yaml b/ss_baselines/av_wan/config/audionav/replica/train_without_am.yaml new file mode 100644 index 0000000..15ca4b0 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/train_without_am.yaml @@ -0,0 +1,42 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal_without_am.yaml" +NUM_PROCESSES: 5 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 10000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False + + +RL: + SUCCESS_REWARD: 10.0 + SLACK_REWARD: -0.01 + WITH_TIME_PENALTY: True + WITH_DISTANCE_REWARD: True + DISTANCE_REWARD_SCALE: 0.25 + WITH_PREDICTION_REWARD: False + GOAL_PREDICTION_SCALE: 1.0 + + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.02 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + use_exponential_lr_decay: False + exp_decay_lambda: 5.0 + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/train_without_both.yaml 
b/ss_baselines/av_wan/config/audionav/replica/train_without_both.yaml new file mode 100644 index 0000000..642eac2 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/train_without_both.yaml @@ -0,0 +1,42 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal_without_both.yaml" +NUM_PROCESSES: 5 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 10000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False + + +RL: + SUCCESS_REWARD: 10.0 + SLACK_REWARD: -0.01 + WITH_TIME_PENALTY: True + WITH_DISTANCE_REWARD: True + DISTANCE_REWARD_SCALE: 0.25 + WITH_PREDICTION_REWARD: False + GOAL_PREDICTION_SCALE: 1.0 + + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.02 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + use_exponential_lr_decay: False + exp_decay_lambda: 5.0 + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/train_without_gm.yaml b/ss_baselines/av_wan/config/audionav/replica/train_without_gm.yaml new file mode 100644 index 0000000..a3d6101 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/train_without_gm.yaml @@ -0,0 +1,42 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal_without_gm.yaml" +NUM_PROCESSES: 5 +SENSORS: ["DEPTH_SENSOR"] +NUM_UPDATES: 10000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False + + +RL: + SUCCESS_REWARD: 10.0 + SLACK_REWARD: -0.01 + WITH_TIME_PENALTY: True + WITH_DISTANCE_REWARD: True + DISTANCE_REWARD_SCALE: 0.25 + WITH_PREDICTION_REWARD: False + GOAL_PREDICTION_SCALE: 1.0 + + PPO: + # ppo params + clip_param: 0.1 + ppo_epoch: 4 + num_mini_batch: 1 + value_loss_coef: 0.5 + entropy_coef: 0.02 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.5 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: True + use_linear_lr_decay: True + use_exponential_lr_decay: False + exp_decay_lambda: 5.0 + # window size for calculating the past rewards + reward_window_size: 50 \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/val_with_am.yaml b/ss_baselines/av_wan/config/audionav/replica/val_with_am.yaml new file mode 100644 index 0000000..202be66 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/val_with_am.yaml @@ -0,0 +1,13 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: True + + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/val_without_am.yaml b/ss_baselines/av_wan/config/audionav/replica/val_without_am.yaml new file mode 100644 index 0000000..b3a15fc --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/val_without_am.yaml @@ -0,0 +1,13 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal_without_am.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: 
False +USE_SYNC_VECENV: True + + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/val_without_both.yaml b/ss_baselines/av_wan/config/audionav/replica/val_without_both.yaml new file mode 100644 index 0000000..dd4291e --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/val_without_both.yaml @@ -0,0 +1,13 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal_without_both.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: True + + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/audionav/replica/val_without_gm.yaml b/ss_baselines/av_wan/config/audionav/replica/val_without_gm.yaml new file mode 100644 index 0000000..b02d440 --- /dev/null +++ b/ss_baselines/av_wan/config/audionav/replica/val_without_gm.yaml @@ -0,0 +1,13 @@ +BASE_TASK_CONFIG_PATH: "configs/audionav/av_wan/replica/audiogoal_without_gm.yaml" +NUM_PROCESSES: 1 +SENSORS: ["DEPTH_SENSOR"] +TEST_EPISODE_COUNT: 500 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +ENCODE_DEPTH: False +USE_SYNC_VECENV: True + + +EVAL: + SPLIT: "val_telephone" + USE_CKPT_CONFIG: True \ No newline at end of file diff --git a/ss_baselines/av_wan/config/default.py b/ss_baselines/av_wan/config/default.py new file mode 100644 index 0000000..d51ba89 --- /dev/null +++ b/ss_baselines/av_wan/config/default.py @@ -0,0 +1,295 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
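The `with_am` / `without_am` / `without_gm` / `without_both` variants above are ablation configs that should differ only in which base task config they point at. A quick illustrative sanity check with PyYAML (file paths follow the tree in this diff):

```python
import yaml

with open("ss_baselines/av_wan/config/audionav/replica/val_with_am.yaml") as f:
    base = yaml.safe_load(f)
with open("ss_baselines/av_wan/config/audionav/replica/val_without_gm.yaml") as f:
    ablation = yaml.safe_load(f)

# every top-level key except the base task config path should match
print({k for k in base if base[k] != ablation.get(k)})  # {'BASE_TASK_CONFIG_PATH'}
```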
+ +from typing import List, Optional, Union +import os +import logging +import shutil + + +from habitat import get_config as get_task_config +from habitat.config import Config as CN +from habitat.config.default import SIMULATOR_SENSOR +import habitat + +DEFAULT_CONFIG_DIR = "configs/" +CONFIG_FILE_SEPARATOR = "," +# ----------------------------------------------------------------------------- +# EXPERIMENT CONFIG +# ----------------------------------------------------------------------------- +_C = CN() +_C.SEED = 0 +_C.BASE_TASK_CONFIG_PATH = "configs/tasks/pointnav.yaml" +_C.TASK_CONFIG = CN() # task_config will be stored as a config node +_C.CMD_TRAILING_OPTS = [] # store command line options as list of strings +_C.TRAINER_NAME = "AVWanTrainer" +_C.ENV_NAME = "MapNavEnv" +_C.SIMULATOR_GPU_ID = 0 +_C.TORCH_GPU_ID = 0 +_C.MODEL_DIR = 'data/models/output' +_C.VIDEO_OPTION = ["disk", "tensorboard"] +_C.VISUALIZATION_OPTION = ["top_down_map"] +_C.TENSORBOARD_DIR = "tb" +_C.VIDEO_DIR = "video_dir" +_C.TEST_EPISODE_COUNT = 2 +_C.EVAL_CKPT_PATH_DIR = "data/checkpoints" # path to ckpt or path to ckpts dir +_C.NUM_PROCESSES = 16 +_C.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR"] +_C.CHECKPOINT_FOLDER = "data/checkpoints" +_C.NUM_UPDATES = 10000 +_C.LOG_INTERVAL = 10 +_C.LOG_FILE = "train.log" +_C.CHECKPOINT_INTERVAL = 50 +_C.USE_VECENV = True +_C.USE_SYNC_VECENV = False +_C.ENCODE_RGB = True +_C.ENCODE_DEPTH = True +_C.DEBUG = False +_C.USE_LAST_CKPT = False +_C.PREDICTION_INTERVAL = 10 +_C.DATASET_FILTER = [] +_C.VISUALIZE_FAILURE_ONLY = False +_C.MASKING = True +_C.DISPLAY_RESOLUTION = 128 +# ----------------------------------------------------------------------------- +# EVAL CONFIG +# ----------------------------------------------------------------------------- +_C.EVAL = CN() +# The split to evaluate on +_C.EVAL.SPLIT = "val" +_C.EVAL.USE_CKPT_CONFIG = True +# ----------------------------------------------------------------------------- +# REINFORCEMENT LEARNING (RL) ENVIRONMENT CONFIG +# ----------------------------------------------------------------------------- +_C.RL = CN() +_C.RL.SUCCESS_REWARD = 10.0 +_C.RL.SLACK_REWARD = -0.01 +_C.RL.WITH_TIME_PENALTY = True +_C.RL.WITH_DISTANCE_REWARD = True +_C.RL.DISTANCE_REWARD_SCALE = 1.0 +_C.RL.WITH_PREDICTION_REWARD = False +_C.RL.GOAL_PREDICTION_SCALE = 1.0 +_C.RL.TIME_DIFF = False +# ----------------------------------------------------------------------------- +# PROXIMAL POLICY OPTIMIZATION (PPO) +# ----------------------------------------------------------------------------- +_C.RL.PPO = CN() +_C.RL.PPO.clip_param = 0.2 +_C.RL.PPO.ppo_epoch = 4 +_C.RL.PPO.num_mini_batch = 16 +_C.RL.PPO.value_loss_coef = 0.5 +_C.RL.PPO.entropy_coef = 0.01 +_C.RL.PPO.lr = 7e-4 +_C.RL.PPO.eps = 1e-5 +_C.RL.PPO.max_grad_norm = 0.5 +_C.RL.PPO.num_steps = 5 +_C.RL.PPO.hidden_size = 512 +_C.RL.PPO.use_gae = True +_C.RL.PPO.use_linear_lr_decay = False +_C.RL.PPO.use_linear_clip_decay = False +_C.RL.PPO.use_exponential_lr_decay = False +_C.RL.PPO.exp_decay_lambda = 1.0 +_C.RL.PPO.gamma = 0.99 +_C.RL.PPO.tau = 0.95 +_C.RL.PPO.reward_window_size = 50 +# ----------------------------------------------------------------------------- +# TASK CONFIG +# ----------------------------------------------------------------------------- +_TC = habitat.get_config() +_TC.defrost() +_TC.ENVIRONMENT.ITERATOR_OPTIONS.MAX_SCENE_REPEAT_STEPS = int(1e4) +# ----------------------------------------------------------------------------- +# AUDIOGOAL_SENSOR +# 
----------------------------------------------------------------------------- +_TC.TASK.AUDIOGOAL_SENSOR = CN() +_TC.TASK.AUDIOGOAL_SENSOR.TYPE = "AudioGoalSensor" +# ----------------------------------------------------------------------------- +# SPECTROGRAM_SENSOR +# ----------------------------------------------------------------------------- +_TC.TASK.SPECTROGRAM_SENSOR = CN() +_TC.TASK.SPECTROGRAM_SENSOR.TYPE = "SpectrogramSensor" +# ----------------------------------------------------------------------------- +# habitat_audio +# ----------------------------------------------------------------------------- +_TC.SIMULATOR.GRID_SIZE = 0.5 +_TC.SIMULATOR.CONTINUOUS_VIEW_CHANGE = False +_TC.SIMULATOR.VIEW_CHANGE_FPS = 10 +_TC.SIMULATOR.SCENE_DATASET = 'replica' +_TC.SIMULATOR.USE_RENDERED_OBSERVATIONS = True +_TC.SIMULATOR.SCENE_OBSERVATION_DIR = 'data/scene_observations' +_TC.SIMULATOR.AUDIO = CN() +_TC.SIMULATOR.AUDIO.SCENE = "" +_TC.SIMULATOR.AUDIO.BINAURAL_RIR_DIR = "data/binaural_rirs" +_TC.SIMULATOR.AUDIO.RIR_SAMPLING_RATE = 44100 +_TC.SIMULATOR.AUDIO.SOURCE_SOUND_DIR = "data/sounds/1s_all" +_TC.SIMULATOR.AUDIO.METADATA_DIR = "data/metadata" +_TC.SIMULATOR.AUDIO.POINTS_FILE = 'points.txt' +_TC.SIMULATOR.AUDIO.GRAPH_FILE = 'graph.pkl' +_TC.SIMULATOR.AUDIO.HAS_DISTRACTOR_SOUND = False +_TC.SIMULATOR.AUDIO.EVERLASTING = True +# ----------------------------------------------------------------------------- +# DistanceToGoal Measure +# ----------------------------------------------------------------------------- +_TC.TASK.DISTANCE_TO_GOAL = CN() +_TC.TASK.DISTANCE_TO_GOAL.TYPE = "DistanceToGoal" +_TC.TASK.DISTANCE_TO_GOAL.DISTANCE_TO = "POINT" +# ----------------------------------------------------------------------------- +# NormalizedDistanceToGoal Measure +# ----------------------------------------------------------------------------- +_TC.TASK.NORMALIZED_DISTANCE_TO_GOAL = CN() +_TC.TASK.NORMALIZED_DISTANCE_TO_GOAL.TYPE = "NormalizedDistanceToGoal" +# ----------------------------------------------------------------------------- +# Dataset extension +# ----------------------------------------------------------------------------- +_TC.DATASET.VERSION = 'v1' +# ----------------------------------------------------------------------------- +# Egocentric occupancy map projected from depth image +# ----------------------------------------------------------------------------- +_TC.TASK.EGOMAP_SENSOR = SIMULATOR_SENSOR.clone() +_TC.TASK.EGOMAP_SENSOR.TYPE = "EgoMap" +_TC.TASK.EGOMAP_SENSOR.MAP_SIZE = 31 +_TC.TASK.EGOMAP_SENSOR.MAP_RESOLUTION = 0.1 +_TC.TASK.EGOMAP_SENSOR.HEIGHT_THRESH = (0.5, 2.0) +# ----------------------------------------------------------------------------- +# Global map placeholder +# ----------------------------------------------------------------------------- +_TC.TASK.GEOMETRIC_MAP = SIMULATOR_SENSOR.clone() +_TC.TASK.GEOMETRIC_MAP.TYPE = "GeometricMap" +_TC.TASK.GEOMETRIC_MAP.MAP_SIZE = 200 +_TC.TASK.GEOMETRIC_MAP.INTERNAL_MAP_SIZE = 500 +_TC.TASK.GEOMETRIC_MAP.MAP_RESOLUTION = 0.1 +_TC.TASK.GEOMETRIC_MAP.NUM_CHANNEL = 2 +# ----------------------------------------------------------------------------- +# Acoustic map placeholder +# ----------------------------------------------------------------------------- +_TC.TASK.ACOUSTIC_MAP = SIMULATOR_SENSOR.clone() +_TC.TASK.ACOUSTIC_MAP.TYPE = "AcousticMap" +_TC.TASK.ACOUSTIC_MAP.MAP_SIZE = 20 +_TC.TASK.ACOUSTIC_MAP.MAP_RESOLUTION = 0.5 +_TC.TASK.ACOUSTIC_MAP.NUM_CHANNEL = 1 +_TC.TASK.ACOUSTIC_MAP.ENCODING = 
"average_intensity" +# ----------------------------------------------------------------------------- +# Local occupancy map placeholder +# ----------------------------------------------------------------------------- +_TC.TASK.ACTION_MAP = SIMULATOR_SENSOR.clone() +_TC.TASK.ACTION_MAP.TYPE = "ActionMap" +_TC.TASK.ACTION_MAP.MAP_SIZE = 9 +_TC.TASK.ACTION_MAP.MAP_RESOLUTION = 0.5 +_TC.TASK.ACTION_MAP.NUM_CHANNEL = 1 +# ----------------------------------------------------------------------------- +# Collision Sensor in habitat-audio +# ----------------------------------------------------------------------------- +_TC.TASK.COLLISION = SIMULATOR_SENSOR.clone() +_TC.TASK.COLLISION.TYPE = "Collision" +# ----------------------------------------------------------------------------- +# Intensity value placeholder +# ----------------------------------------------------------------------------- +_TC.TASK.INTENSITY = SIMULATOR_SENSOR.clone() +_TC.TASK.INTENSITY.TYPE = "Intensity" +# ----------------------------------------------------------------------------- +# Number of action metric +# ----------------------------------------------------------------------------- +_TC.TASK.NUM_ACTION = CN() +_TC.TASK.NUM_ACTION.TYPE = "NA" +# ----------------------------------------------------------------------------- +# Success normalized by number of action metric +# ----------------------------------------------------------------------------- +_TC.TASK.SUCCESS_WEIGHTED_BY_NUM_ACTION = CN() +_TC.TASK.SUCCESS_WEIGHTED_BY_NUM_ACTION.TYPE = "SNA" + + +def merge_from_path(config, config_paths): + if config_paths: + if isinstance(config_paths, str): + if CONFIG_FILE_SEPARATOR in config_paths: + config_paths = config_paths.split(CONFIG_FILE_SEPARATOR) + else: + config_paths = [config_paths] + + for config_path in config_paths: + config.merge_from_file(config_path) + return config + + +def get_config( + config_paths: Optional[Union[List[str], str]] = None, + opts: Optional[list] = None, + model_dir: Optional[str] = None, + run_type: Optional[str] = None, + overwrite: bool = False +) -> CN: + r"""Create a unified config with default values overwritten by values from + `config_paths` and overwritten by options from `opts`. + Args: + config_paths: List of config paths or string that contains comma + separated list of config paths. + opts: Config options (keys, values) in a list (e.g., passed from + command line into the config. For example, `opts = ['FOO.BAR', + 0.5]`. Argument can be used for parameter sweeping or quick tests. 
+ model_dir: suffix for output dirs + run_type: either train or eval + overwrite: overwrite model directory + """ + config = merge_from_path(_C.clone(), config_paths) + config.TASK_CONFIG = get_task_config(config_paths=config.BASE_TASK_CONFIG_PATH) + + # config_name = os.path.basename(config_paths).split('.')[0] + if model_dir is not None: + config.MODEL_DIR = model_dir + config.TENSORBOARD_DIR = os.path.join(config.MODEL_DIR, 'tb') + config.CHECKPOINT_FOLDER = os.path.join(config.MODEL_DIR, 'data') + config.VIDEO_DIR = os.path.join(config.MODEL_DIR, 'video_dir') + config.LOG_FILE = os.path.join(config.MODEL_DIR, 'train.log') + config.EVAL_CKPT_PATH_DIR = os.path.join(config.MODEL_DIR, 'data') + + if opts: + config.CMD_TRAILING_OPTS = opts + config.merge_from_list(opts) + + dirs = [config.VIDEO_DIR, config.TENSORBOARD_DIR, config.CHECKPOINT_FOLDER] + if run_type == 'train': + # check dirs + if any([os.path.exists(d) for d in dirs]): + for d in dirs: + if os.path.exists(d): + logging.warning('{} exists'.format(d)) + # if overwrite or input('Output directory already exists! Overwrite the folder? (y/n)') == 'y': + if overwrite: + for d in dirs: + if os.path.exists(d): + shutil.rmtree(d) + + config.TASK_CONFIG.defrost() + config.TASK_CONFIG.SIMULATOR.USE_SYNC_VECENV = config.USE_SYNC_VECENV + config.TASK_CONFIG.freeze() + + config.freeze() + return config + + +def get_task_config( + config_paths: Optional[Union[List[str], str]] = None, + opts: Optional[list] = None +) -> habitat.Config: + config = _TC.clone() + config.set_new_allowed(False) + if config_paths: + if isinstance(config_paths, str): + if CONFIG_FILE_SEPARATOR in config_paths: + config_paths = config_paths.split(CONFIG_FILE_SEPARATOR) + else: + config_paths = [config_paths] + + for config_path in config_paths: + config.merge_from_file(config_path) + + if opts: + config.merge_from_list(opts) + + config.freeze() + return config diff --git a/ss_baselines/av_wan/mapnav_env.py b/ss_baselines/av_wan/mapnav_env.py new file mode 100644 index 0000000..9cf5db9 --- /dev/null +++ b/ss_baselines/av_wan/mapnav_env.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
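A minimal sketch of how `get_config` composes these layers; the merge precedence is package defaults, then the experiment YAML(s), then `opts`. The call below is illustrative and assumes the repository's `configs/` tree is present so that `BASE_TASK_CONFIG_PATH` resolves; `model_dir` is a hypothetical output directory:

```python
from ss_baselines.av_wan.config.default import get_config

config = get_config(
    config_paths="ss_baselines/av_wan/config/audionav/replica/train_with_am.yaml",
    opts=["NUM_PROCESSES", 1],     # yacs merge_from_list key/value pairs, applied last
    model_dir="data/models/demo",  # redirects tb/, data/, video_dir/, train.log
    run_type="train",
)
print(config.RL.PPO.num_steps)  # 150, from the YAML, overriding the default of 5
```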
+ +from typing import Optional +import logging + +import numpy as np +import habitat +import torch +from habitat import Config, Dataset +from habitat.utils.visualizations.utils import observations_to_image +from ss_baselines.common.baseline_registry import baseline_registry +from ss_baselines.av_wan.models.planner import Planner + + +@baseline_registry.register_env(name="MapNavEnv") +class MapNavEnv(habitat.RLEnv): + def __init__(self, config: Config, dataset: Optional[Dataset] = None): + self._config = config + self._rl_config = config.RL + self._core_env_config = config.TASK_CONFIG + + self._previous_target_distance = None + self._previous_action = None + self._previous_observation = None + self._episode_distance_covered = None + self._success_distance = self._core_env_config.TASK.SUCCESS_DISTANCE + super().__init__(self._core_env_config, dataset) + + self.planner = Planner(model_dir=self._config.MODEL_DIR, + use_acoustic_map='ACOUSTIC_MAP' in config.TASK_CONFIG.TASK.SENSORS, + masking=self._config.MASKING, + task_config=config.TASK_CONFIG + ) + torch.set_num_threads(1) + + def reset(self): + self._previous_action = None + + observations = super().reset() + self.planner.update_map_and_graph(observations) + self.planner.add_maps_to_observation(observations) + self._previous_observation = observations + logging.debug(super().current_episode) + + self._previous_target_distance = self.habitat_env.current_episode.info[ + "geodesic_distance" + ] + return observations + + def step(self, *args, **kwargs): + intermediate_goal = kwargs["action"] + self._previous_action = intermediate_goal + goal = self.planner.get_map_coordinates(intermediate_goal) + stop = int(self._config.TASK_CONFIG.TASK.ACTION_MAP.MAP_SIZE ** 2 // 2) == intermediate_goal + observation = self._previous_observation + cumulative_reward = 0 + done = False + reaching_waypoint = False + cant_reach_waypoint = False + if len(self._config.VIDEO_OPTION) > 0: + rgb_frames = list() + audios = list() + + for step_count in range(self._config.PREDICTION_INTERVAL): + if step_count != 0 and not self.planner.check_navigability(goal): + cant_reach_waypoint = True + break + action = self.planner.plan(observation, goal, stop=stop) + observation, reward, done, info = super().step({"action": action}) + if len(self._config.VIDEO_OPTION) > 0: + if "rgb" not in observation: + observation["rgb"] = np.zeros((self.config.DISPLAY_RESOLUTION, + self.config.DISPLAY_RESOLUTION, 3)) + frame = observations_to_image(observation, info) + rgb_frames.append(frame) + audios.append(observation['audiogoal']) + cumulative_reward += reward + if done: + self.planner.reset() + observation = self.reset() + break + else: + self.planner.update_map_and_graph(observation) + # reaching intermediate goal + x, y = self.planner.mapper.get_maps_and_agent_pose()[2:4] + if (x - goal[0]) == (y - goal[1]) == 0: + reaching_waypoint = True + break + + if not done: + self.planner.add_maps_to_observation(observation) + self._previous_observation = observation + info['reaching_waypoint'] = done or reaching_waypoint + info['cant_reach_waypoint'] = cant_reach_waypoint + if len(self._config.VIDEO_OPTION) > 0: + assert len(rgb_frames) != 0 + info['rgb_frames'] = rgb_frames + info['audios'] = audios + + return observation, cumulative_reward, done, info + + def get_reward_range(self): + return ( + self._rl_config.SLACK_REWARD - 1.0, + self._rl_config.SUCCESS_REWARD + 1.0, + ) + + def get_reward(self, observations): + reward = 0 + + if self._rl_config.WITH_TIME_PENALTY: + reward += 
self._rl_config.SLACK_REWARD + + if self._rl_config.WITH_DISTANCE_REWARD: + current_target_distance = self._distance_target() + # if current_target_distance < self._previous_target_distance: + reward += (self._previous_target_distance - current_target_distance) * self._rl_config.DISTANCE_REWARD_SCALE + self._previous_target_distance = current_target_distance + + if self._episode_success(): + reward += self._rl_config.SUCCESS_REWARD + logging.debug('Reaching goal!') + + return reward + + def _distance_target(self): + current_position = self._env.sim.get_agent_state().position.tolist() + target_position = [goal.position for goal in self._env.current_episode.goals] + distance = self._env.sim.geodesic_distance( + current_position, target_position + ) + return distance + + def _episode_success(self): + if ( + self._env.task.is_stop_called + # and self._distance_target() < self._success_distance + and self._env.sim.reaching_goal + ): + return True + return False + + def get_done(self, observations): + done = False + if self._env.episode_over or self._episode_success(): + done = True + return done + + def get_info(self, observations): + return self.habitat_env.get_metrics() + + # for data collection + def get_current_episode_id(self): + return self.habitat_env.current_episode.episode_id + + def global_to_egocentric(self, pg): + return self.planner.mapper.global_to_egocentric(*pg) + + def egocentric_to_global(self, pg): + return self.planner.mapper.egocentric_to_global(*pg) diff --git a/ss_baselines/av_wan/models/__init__.py b/ss_baselines/av_wan/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ss_baselines/av_wan/models/audio_cnn.py b/ss_baselines/av_wan/models/audio_cnn.py new file mode 100644 index 0000000..1b6ec39 --- /dev/null +++ b/ss_baselines/av_wan/models/audio_cnn.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
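For concreteness, the per-step reward in `MapNavEnv.get_reward` above, under the training YAMLs in this diff (`SLACK_REWARD=-0.01`, `DISTANCE_REWARD_SCALE=0.25`, `SUCCESS_REWARD=10.0`), works out as in this illustrative sketch; the distances are made-up numbers:

```python
# one step in which the agent moves 0.5 m closer to the goal, no success yet
slack_reward = -0.01                # WITH_TIME_PENALTY adds this every step
prev_dist, cur_dist = 5.0, 4.5      # geodesic distance to goal before/after
distance_reward = (prev_dist - cur_dist) * 0.25   # reward for progress made
reward = slack_reward + distance_reward
print(round(reward, 3))  # 0.115
# SUCCESS_REWARD (+10.0) is added only when STOP fires while reaching the goal
```

Note that `MapNavEnv.step` sums these low-level rewards over up to `PREDICTION_INTERVAL` environment steps into the single `cumulative_reward` returned per waypoint.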
+ +import numpy as np +import torch +import torch.nn as nn + +from ss_baselines.common.utils import Flatten +from ss_baselines.av_nav.models.visual_cnn import conv_output_dim, layer_init + + +class AudioCNN(nn.Module): + r"""A Simple 3-Conv CNN for processing audio spectrogram + + Args: + observation_space: The observation_space of the agent + output_size: The size of the embedding vector + """ + + def __init__(self, observation_space, output_size): + super().__init__() + self._n_input_audio = observation_space.spaces["spectrogram"].shape[2] + + cnn_dims = np.array( + observation_space.spaces["spectrogram"].shape[:2], dtype=np.float32 + ) + + if cnn_dims[0] < 30 or cnn_dims[1] < 30: + self._cnn_layers_kernel_size = [(5, 5), (3, 3), (3, 3)] + self._cnn_layers_stride = [(2, 2), (2, 2), (1, 1)] + else: + self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)] + self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)] + + for kernel_size, stride in zip( + self._cnn_layers_kernel_size, self._cnn_layers_stride + ): + cnn_dims = conv_output_dim( + dimension=cnn_dims, + padding=np.array([0, 0], dtype=np.float32), + dilation=np.array([1, 1], dtype=np.float32), + kernel_size=np.array(kernel_size, dtype=np.float32), + stride=np.array(stride, dtype=np.float32), + ) + + self.cnn = nn.Sequential( + nn.Conv2d( + in_channels=self._n_input_audio, + out_channels=32, + kernel_size=self._cnn_layers_kernel_size[0], + stride=self._cnn_layers_stride[0], + ), + nn.ReLU(True), + nn.Conv2d( + in_channels=32, + out_channels=64, + kernel_size=self._cnn_layers_kernel_size[1], + stride=self._cnn_layers_stride[1], + ), + nn.ReLU(True), + nn.Conv2d( + in_channels=64, + out_channels=32, + kernel_size=self._cnn_layers_kernel_size[2], + stride=self._cnn_layers_stride[2], + ), + # nn.ReLU(True), + Flatten(), + nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size), + nn.ReLU(True), + ) + + layer_init(self.cnn) + + def forward(self, observations): + cnn_input = [] + + audio_observations = observations["spectrogram"] + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + audio_observations = audio_observations.permute(0, 3, 1, 2) + cnn_input.append(audio_observations) + + cnn_input = torch.cat(cnn_input, dim=1) + + return self.cnn(cnn_input) diff --git a/ss_baselines/av_wan/models/map_cnn.py b/ss_baselines/av_wan/models/map_cnn.py new file mode 100644 index 0000000..fba9fd0 --- /dev/null +++ b/ss_baselines/av_wan/models/map_cnn.py @@ -0,0 +1,104 @@ +# !/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import numpy as np +import torch +import torch.nn as nn + +from ss_baselines.common.utils import Flatten +from ss_baselines.av_nav.models.visual_cnn import conv_output_dim, layer_init + + +class MapCNN(nn.Module): + r"""A Simple CNN for processing map inputs (acoustic map or geometric map) + + Args: + observation_space: The observation_space of the agent + output_size: The size of the embedding vector + """ + + def __init__(self, observation_space, output_size, map_type='gm'): + super().__init__() + self._map_type = map_type + self._n_input_gm = observation_space.spaces[map_type].shape[2] + + cnn_dims = np.array( + observation_space.spaces[map_type].shape[:2], dtype=np.float32 + ) + # input image of dimension N reduces to (ceil((N-f+1)/s),ceil((N-f+1)/s),Number of filters) + # where f is the filter size and s is the stride length + # kernel size for different CNN layers + if self._map_type == 'gm': + if cnn_dims[0] == 200: + self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)] + + # strides for different CNN layers + self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)] + else: + assert cnn_dims[0] == 400 + self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)] + + # strides for different CNN layers + self._cnn_layers_stride = [(5, 5), (4, 4), (2, 2)] + elif self._map_type == 'am': + self._cnn_layers_kernel_size = [(5, 5), (3, 3), (3, 3)] + + # strides for different CNN layers + self._cnn_layers_stride = [(2, 2), (1, 1), (1, 1)] + + for kernel_size, stride in zip( + self._cnn_layers_kernel_size, self._cnn_layers_stride + ): + cnn_dims = conv_output_dim( + dimension=cnn_dims, + padding=np.array([0, 0], dtype=np.float32), + dilation=np.array([1, 1], dtype=np.float32), + kernel_size=np.array(kernel_size, dtype=np.float32), + stride=np.array(stride, dtype=np.float32), + ) + + self.cnn = nn.Sequential( + nn.Conv2d( + in_channels=self._n_input_gm, + out_channels=32, + kernel_size=self._cnn_layers_kernel_size[0], + stride=self._cnn_layers_stride[0], + ), + nn.ReLU(True), + nn.Conv2d( + in_channels=32, + out_channels=64, + kernel_size=self._cnn_layers_kernel_size[1], + stride=self._cnn_layers_stride[1], + ), + nn.ReLU(True), + nn.Conv2d( + in_channels=64, + out_channels=32, + kernel_size=self._cnn_layers_kernel_size[2], + stride=self._cnn_layers_stride[2], + ), + # nn.ReLU(True), + Flatten(), + nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size), + nn.ReLU(True), + ) + + layer_init(self.cnn) + + def forward(self, observations): + cnn_input = [] + + gm_observations = observations[self._map_type] + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + gm_observations = gm_observations.permute(0, 3, 1, 2) + cnn_input.append(gm_observations) + + cnn_input = torch.cat(cnn_input, dim=1) + + return self.cnn(cnn_input) diff --git a/ss_baselines/av_wan/models/mapper.py b/ss_baselines/av_wan/models/mapper.py new file mode 100644 index 0000000..68018cd --- /dev/null +++ b/ss_baselines/av_wan/models/mapper.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
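A quick shape check for the geometric-map branch of `MapCNN` above, assuming the module is importable from this diff's layout; the 200x200, 2-channel `gm` matches `TASK.GEOMETRIC_MAP` in `default.py`, and `output_size=256` is an arbitrary illustrative choice:

```python
import numpy as np
import torch
from gym import spaces
from ss_baselines.av_wan.models.map_cnn import MapCNN

obs_space = spaces.Dict({
    "gm": spaces.Box(low=0, high=1, shape=(200, 200, 2), dtype=np.uint8),
})
cnn = MapCNN(obs_space, output_size=256, map_type='gm')
batch = {"gm": torch.zeros(4, 200, 200, 2)}   # NHWC; forward() permutes to NCHW
print(cnn(batch).shape)                       # torch.Size([4, 256])
```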
+ +from typing import Tuple +import logging + +import torch.nn as nn +import torch +import numpy as np + +from habitat.sims.habitat_simulator.actions import HabitatSimActions + + +def to_array(x): + if torch.is_tensor(x): + x = x.cpu().numpy() + else: + x = x + return x + + +class Mapper(nn.Module): + def __init__(self, gm_config, am_config, action_map_config, use_acoustic_map): + super(Mapper, self).__init__() + self._internal_gm_size = gm_config.INTERNAL_MAP_SIZE + self._gm_size = gm_config.MAP_SIZE + self._gm_res = gm_config.MAP_RESOLUTION + self._use_acoustic_map = use_acoustic_map + self._am_encoding = am_config.ENCODING + self._action_map_res = action_map_config.MAP_RESOLUTION + self._stride = int(self._action_map_res / self._gm_res) + # allocentric map w.r.t the agent's initial pose + # both global map and local maps are square, the first channel is obstacle map and the second is explored map + self._geometric_map = None + self._prev_geometric_map = None + self._acoustic_map = None + self._x = None + self._y = None + self._orientation = None + self._initial_orientation = None + self._navigable_xs = None + self._navigable_ys = None + self._rotated_xs = dict() + self._rotated_ys = dict() + + self.reset() + + def compute_navigable_xys(self): + navigable_xs = [] + for n in range(int(-self._x / self._stride), int((self._internal_gm_size - self._x) / self._stride)): + navigable_xs.append(self._x + n * self._stride) + + navigable_ys = [] + for n in range(int(-self._y / self._stride), int((self._internal_gm_size - self._y) / self._stride)): + navigable_ys.append(self._y + n * self._stride) + + self._navigable_xs, self._navigable_ys = navigable_xs, navigable_ys + + for angle in [0, 90, 180, 270]: + navigable_xs = [] + navigable_ys = [] + for a, b in zip(self._navigable_xs, self._navigable_ys): + c, d = transform_coordinates(a, b, angle, self._geometric_map.shape[1], self._geometric_map.shape[0]) + navigable_xs.append(c) + navigable_ys.append(d) + navigable_xs = sorted(navigable_xs) + navigable_ys = sorted(navigable_ys) + self._rotated_xs[angle] = navigable_xs + self._rotated_ys[angle] = navigable_ys + + return self._navigable_xs, self._navigable_ys + + def reset(self): + self._geometric_map = np.zeros((self._internal_gm_size, self._internal_gm_size, 2)) + if self._use_acoustic_map: + if self._am_encoding == 'intensity': + num_channel = 1 + elif self._am_encoding == 'average_intensity': + num_channel = 1 + else: + raise ValueError + self._acoustic_map = np.zeros((self._internal_gm_size // self._stride, + self._internal_gm_size // self._stride, num_channel)) + self._x = int(self._internal_gm_size / 2) + self._y = int(self._internal_gm_size / 2) + # set the initial orientation to be 270 on X-Z plane in 3D coordinate frame + self._orientation = 270 + self._initial_orientation = self._orientation + + @property + def _rotation(self): + # orientation increases clockwise, rotation increases counterclockwise + return -(self._orientation - self._initial_orientation) + + def update(self, prev_action: int, ego_map, intensity) -> Tuple[list, list]: + if logging.root.level == logging.DEBUG: + self._prev_geometric_map = np.copy(self._geometric_map) + + if prev_action == HabitatSimActions.MOVE_FORWARD: + self._x += int(self._stride * np.cos(np.deg2rad(self._orientation))) + self._y += int(self._stride * np.sin(np.deg2rad(self._orientation))) + elif prev_action == HabitatSimActions.TURN_LEFT: + self._orientation = (self._orientation - 90) % 360 + elif prev_action == HabitatSimActions.TURN_RIGHT: + 
self._orientation = (self._orientation + 90) % 360 + else: + # do nothing for the first step + pass + + # update global map + rotated_geometric_map = rotate_map(self._geometric_map, -self._rotation, create_copy=False) + rotated_x, rotated_y = transform_coordinates(self._x, self._y, -self._rotation, + self._geometric_map.shape[1], self._geometric_map.shape[0]) + left = rotated_x - int(ego_map.shape[1] / 2) + right = left + ego_map.shape[1] + # paste the ego map in front of the agent; the agent's current row is not updated + top = rotated_y + bottom = top - ego_map.shape[0] + rotated_geometric_map[bottom: top, left: right, :] = \ + np.logical_or(rotated_geometric_map[bottom: top, left: right, :] > 0.5, ego_map > 0.5) + + # update acoustic map + if self._use_acoustic_map: + am_x = self._x // self._stride + am_y = self._y // self._stride + if self._am_encoding == 'intensity': + self._acoustic_map[am_y, am_x, 0] = intensity + elif self._am_encoding == 'average_intensity': + if self._acoustic_map[am_y, am_x] == 0: + self._acoustic_map[am_y, am_x] = intensity + else: + self._acoustic_map[am_y, am_x] = 0.5 * intensity + 0.5 * self._acoustic_map[am_y, am_x] + + # compute new blocked paths and non-navigable points in the affected region + new_left = max(left - self._stride, 0) + new_bottom = max(bottom - self._stride, 0) + new_right = min(right + self._stride, self._geometric_map.shape[1]) + new_top = min(top + self._stride, self._geometric_map.shape[0]) + m = self._stride + navigable_xs = [] + for n in range(int((new_left - rotated_x) / m), int((new_right + 1 - rotated_x) / m)): + navigable_xs.append(rotated_x + n * m) + navigable_ys = [] + for n in range(int((new_bottom - rotated_y) / m), int((new_top + 1 - rotated_y) / m)): + navigable_ys.append(rotated_y + n * m) + + def convert(a, b): + return transform_coordinates(a, b, self._rotation, rotated_geometric_map.shape[1], rotated_geometric_map.shape[0]) + + non_navigable_points = [] + blocked_paths = [] + for idx_y, y in enumerate(navigable_ys): + for idx_x, x in enumerate(navigable_xs): + if rotated_geometric_map[y, x, 0]: + if x == rotated_x and y == rotated_y: + logging.warning("Mapper: marked current position as obstacle") + self._geometric_map[self._y, self._x, 0] = 0 + else: + non_navigable_points.append(convert(x, y)) + + # block the path to the next navigable point along +Z if any obstacle lies in between + if idx_y < len(navigable_ys) - 1: + next_y = navigable_ys[idx_y + 1] + if any(rotated_geometric_map[y: next_y + 1, x, 0]): + blocked_paths.append((convert(x, y), convert(x, next_y))) + + # block the path to the next navigable point along +X if any obstacle lies in between + if idx_x < len(navigable_xs) - 1: + next_x = navigable_xs[idx_x + 1] + if any(rotated_geometric_map[y, x: next_x + 1, 0]): + blocked_paths.append((convert(x, y), convert(next_x, y))) + assert (self._x, self._y) not in non_navigable_points + return non_navigable_points, blocked_paths + + def get_adjacent_point_coordinates(self): + return self._x + int(self._stride * np.cos(np.deg2rad(self._orientation))), \ + self._y + int(self._stride * np.sin(np.deg2rad(self._orientation))) + + def get_maps_and_agent_pose(self): + return self._geometric_map, self._acoustic_map, self._x, self._y, self._orientation + + def get_orientation(self): + return self._orientation + + def egocentric_to_allocentric(self, delta_x, delta_y, action_map_res=None): + """ + Apply the agent's rotation to the relative (delta_x, delta_y); rotates counterclockwise. + """ + if action_map_res is not None: + delta_x *= int(action_map_res / self._gm_res) + delta_y *= int(action_map_res / 
self._gm_res) + rotation = self._rotation % 360 + if rotation == 0: + return delta_x, delta_y + elif rotation == 90: + return delta_y, -delta_x + elif rotation == 180: + return -delta_x, -delta_y + else: + return -delta_y, delta_x + + def allocentric_to_egocentric(self, x, y, action_map_res=None): + if action_map_res is not None: + x /= int(action_map_res / self._gm_res) + y /= int(action_map_res / self._gm_res) + + rotation = self._rotation % 360 + if rotation == 0: + return x, y + elif rotation == 90: + return -y, x + elif rotation == 180: + return -x, -y + else: + return y, -x + + def global_to_egocentric(self, x, y): + return self.allocentric_to_egocentric(x - self._x, y - self._y, self._action_map_res) + + def egocentric_to_global(self, delta_x, delta_y): + allocentric = self.egocentric_to_allocentric(delta_x, delta_y, self._action_map_res) + return self._x + allocentric[0], self._y + allocentric[1] + + def is_explored(self, x, y): + return self._geometric_map[y][x][1] > 0.5 + + def get_egocentric_geometric_map(self): + # crop internal gm to external gm + rotated_geometric_map = rotate_map(self._geometric_map, -self._rotation, create_copy=False) + x, y = transform_coordinates(self._x, self._y, -self._rotation, + self._geometric_map.shape[1], self._geometric_map.shape[0]) + map_size = rotated_geometric_map.shape[0] + + cropped_map = np.zeros((self._gm_size, self._gm_size, self._geometric_map.shape[2])) + top = max(self._gm_size // 2 - y, 0) + left = max(self._gm_size // 2 - x, 0) + bottom = min(map_size + self._gm_size // 2 - y, self._gm_size) + right = min(map_size + self._gm_size // 2 - x, self._gm_size) + cropped_map[top: bottom, left: right] = \ + rotated_geometric_map[max(y - self._gm_size // 2, 0): + min(y + self._gm_size // 2, map_size), + max(x - self._gm_size // 2, 0): + min(x + self._gm_size // 2, map_size), :] + + return cropped_map + + def get_egocentric_acoustic_map(self, crop_map_size=20): + channels = [] + + if self._am_encoding in ('intensity', 'average_intensity'): + acoustic_map = self._acoustic_map + else: + raise ValueError('Encoding does not exist') + rotated_acoustic_map = rotate_map(acoustic_map, -self._rotation, create_copy=False) + x, y = transform_coordinates(self._x // self._stride, self._y // self._stride, + -self._rotation, acoustic_map.shape[1], acoustic_map.shape[0]) + map_size = rotated_acoustic_map.shape[0] + cropped_map = np.zeros((crop_map_size, crop_map_size, rotated_acoustic_map.shape[2])) + top = max(crop_map_size // 2 - y, 0) + left = max(crop_map_size // 2 - x, 0) + bottom = min(map_size + crop_map_size // 2 - y, crop_map_size) + right = min(map_size + crop_map_size // 2 - x, crop_map_size) + cropped_map[top: bottom, left: right] = \ + rotated_acoustic_map[max(y - crop_map_size // 2, 0): + min(y + crop_map_size // 2, map_size), + max(x - crop_map_size // 2, 0): + min(x + crop_map_size // 2, map_size), :] + channels.append(cropped_map) + channels = np.concatenate(channels, axis=2) + + return channels + + def get_egocentric_occupancy_map(self, size, action_map_res): + # 1 represents free space and 0 represents occupancy + rotated_geometric_map = rotate_map(self._geometric_map, -self._rotation, create_copy=False) + x, y = transform_coordinates(self._x, self._y, -self._rotation, + self._geometric_map.shape[1], self._geometric_map.shape[0]) + grid_map = rotated_geometric_map[np.ix_(self._rotated_ys[-self._rotation % 360], + self._rotated_xs[-self._rotation % 360])] + grid_x = x // 
self._stride + grid_y = y // self._stride + ego_om = 1 - grid_map[grid_y - size // 2: grid_y + size // 2 + 1, + grid_x - size // 2: grid_x + size // 2 + 1, 0] + + if logging.root.level == logging.DEBUG: + for j in range(size): + for i in range(size): + navigability = ego_om[j, i] + pg_x = int(i - size // 2) + pg_y = int(j - size // 2) + delta_x, delta_y = self.egocentric_to_allocentric(pg_x, pg_y, action_map_res=action_map_res) + goal_x = self._x + delta_x + goal_y = self._y + delta_y + assert navigability == (grid_map[grid_y + pg_y, grid_x + pg_x, 0] == 0) \ + == (self._geometric_map[goal_y, goal_x, 0] == 0) + + return ego_om + + +def rotate_map(om: np.array, rotation: float, create_copy=True) -> np.array: + """ + rotate the input map counterclockwise + :param om: + :param rotation: counterclockwise, from axis 0 to axis 1 + :param create_copy: decides whether the returned map is a copy of the original + :return: + """ + rotation = rotation % 360 + if create_copy: + rotated_map = np.copy(om) + else: + rotated_map = om + + if rotation != 0: + rotated_map = np.rot90(rotated_map, k=int(rotation / 90)) + + return rotated_map + + +def transform_coordinates(x: int, y: int, rotation: int, width: int, height: int) -> Tuple[int, int]: + """ + Rotates x,y counterclockwise + """ + rotation = rotation % 360 + if rotation == 0: + new_x = x + new_y = y + elif rotation == 90: + new_x = y + new_y = width - x - 1 + elif rotation == 180: + new_x = width - x - 1 + new_y = height - y - 1 + else: + new_x = height - y - 1 + new_y = x + + return new_x, new_y \ No newline at end of file diff --git a/ss_baselines/av_wan/models/planner.py b/ss_baselines/av_wan/models/planner.py new file mode 100644 index 0000000..d03e42d --- /dev/null +++ b/ss_baselines/av_wan/models/planner.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
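`transform_coordinates` above rotates integer grid indices counterclockwise in 90-degree steps, so composing a rotation with its complement returns the original cell on a square map. An illustrative self-check, assuming the module path in this diff:

```python
from ss_baselines.av_wan.models.mapper import transform_coordinates

W = H = 5  # square map, as used throughout the Mapper
for rotation in (0, 90, 180, 270):
    x, y = transform_coordinates(1, 3, rotation, W, H)
    x, y = transform_coordinates(x, y, (360 - rotation) % 360, W, H)
    assert (x, y) == (1, 3), (rotation, x, y)
print("round-trips OK")
```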
+ +import logging +import os + +import networkx as nx +import numpy as np +import torch +from habitat.sims.habitat_simulator.actions import HabitatSimActions +from ss_baselines.av_wan.models.mapper import Mapper, to_array + + +class Planner: + def __init__(self, task_config=None, use_acoustic_map=False, model_dir=None, masking=True): + self.mapper = Mapper( + gm_config=task_config.TASK.GEOMETRIC_MAP, + am_config=task_config.TASK.ACOUSTIC_MAP, + action_map_config=task_config.TASK.ACTION_MAP, + use_acoustic_map=use_acoustic_map + ) + + self._action_map_res = task_config.TASK.ACTION_MAP.MAP_RESOLUTION + self._action_map_size = task_config.TASK.ACTION_MAP.MAP_SIZE + self._prev_depth = None + self._prev_next_node = None + self._prev_action = None + self._obstacles = [] + self._obstacle_threshold = 0.5 + self._navigable_xs, self._navigable_ys = self.mapper.compute_navigable_xys() + self._graph = self._map_to_graph(self.mapper.get_maps_and_agent_pose()[0]) + self._removed_edges = list() + self._removed_nodes = list() + self._model_dir = model_dir + self._masking = masking + + self.reset() + + def reset(self): + self._prev_depth = None + self._prev_next_node = None + self._prev_action = None + self._obstacles = [] + self.mapper.reset() + self._graph.add_nodes_from(self._removed_nodes) + self._graph.add_edges_from(self._removed_edges) + self._removed_nodes.clear() + self._removed_edges.clear() + + def update_map_and_graph(self, observation): + ego_map = to_array(observation['ego_map']) + depth = to_array(observation['depth']) + collided = to_array(observation['collision'][0]) + intensity = to_array(observation['intensity'][0]) if 'intensity' in observation else None + + geometric_map, acoustic_map, x, y, orientation = self.mapper.get_maps_and_agent_pose() + if not collided: + non_navigable_points, blocked_paths = self.mapper.update(self._prev_action, ego_map, intensity) + self._update_graph(non_navigable_points, blocked_paths) + elif self._prev_next_node in self._graph.nodes: + # only the edge to the previous next node should be removed + current_node = self._map_index_to_graph_nodes([(x, y)])[0] + self._graph.remove_edge(self._prev_next_node, current_node) + self._removed_edges.append((self._prev_next_node, current_node)) + self._prev_depth = depth + + if logging.root.level == logging.DEBUG: + geometric_map, acoustic_map, x, y, orientation = self.mapper.get_maps_and_agent_pose() + assert not geometric_map[y, x, 0] + for node, attr in self._removed_nodes: + index = attr['map_index'] + assert self.mapper._geometric_map[index[1], index[0]][0] + + def add_maps_to_observation(self, observation): + if 'gm' in observation: + observation['gm'] = self.mapper.get_egocentric_geometric_map().astype(np.float32) + if 'am' in observation: + observation['am'] = self.mapper.get_egocentric_acoustic_map().astype(np.float32) + if 'action_map' in observation: + observation['action_map'] = np.expand_dims(self.mapper.get_egocentric_occupancy_map( + size=self._action_map_size, action_map_res=self._action_map_res), -1).astype(np.float32) + + def plan(self, observation: dict, goal, stop, distribution=None) -> torch.Tensor: + geometric_map, acoustic_map, x, y, orientation = self.mapper.get_maps_and_agent_pose() + graph_nodes = self._map_index_to_graph_nodes([(x, y), (goal[0], goal[1])]) + + next_node = next_node_idx = None + if stop: + action = HabitatSimActions.STOP + self._prev_next_node = None + else: + try: + shortest_path = nx.shortest_path(self._graph, source=graph_nodes[0], target=graph_nodes[1]) + # decide if the 
agent needs to rotate based on the connectivity with the next node + next_node_idx = self._graph.nodes[shortest_path[1]]['map_index'] + self._prev_next_node = shortest_path[1] + desired_orientation = np.round( + np.rad2deg(np.arctan2(next_node_idx[1] - y, next_node_idx[0] - x))) % 360 + rotation = (desired_orientation - orientation) % 360 + + # egocentric frame where the agent faces +x direction + if rotation == 0: + action = HabitatSimActions.MOVE_FORWARD + elif rotation == 90: + action = HabitatSimActions.TURN_RIGHT + elif rotation == 180: + action = np.random.choice([HabitatSimActions.TURN_LEFT, HabitatSimActions.TURN_RIGHT]) + elif rotation == 270: + action = HabitatSimActions.TURN_LEFT + else: + raise ValueError('Invalid rotation') + except (nx.exception.NetworkXNoPath, nx.exception.NodeNotFound) as e: + assert not (self._masking and isinstance(e, nx.exception.NodeNotFound)) + # randomly select a node from neighbors + adjacent_point_coordinates = self.mapper.get_adjacent_point_coordinates() + adjacent_node = self._map_index_to_graph_nodes([adjacent_point_coordinates])[0] + if adjacent_node in self._graph.nodes and (graph_nodes[0], adjacent_node) in self._graph.edges: + action = np.random.choice([HabitatSimActions.MOVE_FORWARD, HabitatSimActions.TURN_LEFT, + HabitatSimActions.TURN_RIGHT]) + else: + action = np.random.choice([HabitatSimActions.TURN_LEFT, HabitatSimActions.TURN_RIGHT]) + self._prev_next_node = None + self._prev_action = action + + return action + + def get_map_coordinates(self, relative_goal): + map_size = self._action_map_size + geometric_map, acoustic_map, x, y, orientation = self.mapper.get_maps_and_agent_pose() + pg_y, pg_x = np.unravel_index(relative_goal, (map_size, map_size)) + pg_x = int(pg_x - map_size // 2) + pg_y = int(pg_y - map_size // 2) + + # transform goal location to be in the global coordinate frame + delta_x, delta_y = self.mapper.egocentric_to_allocentric(pg_x, pg_y, action_map_res=self._action_map_res) + return x + delta_x, y + delta_y + + def check_navigability(self, goal): + geometric_map, acoustic_map, x, y, orientation = self.mapper.get_maps_and_agent_pose() + graph_nodes = self._map_index_to_graph_nodes([(x, y), goal]) + # the goal must be a node of the graph and reachable from the current node + navigable = graph_nodes[1] in self._graph.nodes \ + and nx.has_path(self._graph, source=graph_nodes[0], target=graph_nodes[1]) + + return navigable + + def _update_graph(self, non_navigable_points, blocked_paths): + non_navigable_nodes = self._map_index_to_graph_nodes(non_navigable_points) + blocked_edges = [self._map_index_to_graph_nodes([a, b]) for a, b in blocked_paths] + + for node in non_navigable_nodes: + if node in self._graph.nodes: + self._removed_nodes.append((node, self._graph.nodes[node])) + self._removed_edges += [(node, neighbor) for neighbor in self._graph[node]] + self._removed_edges += blocked_edges + + self._graph.remove_nodes_from(non_navigable_nodes) + self._graph.remove_edges_from(blocked_edges) + + def _map_index_to_graph_nodes(self, map_indices: list) -> list: + graph_nodes = list() + for map_index in map_indices: + graph_nodes.append(map_index[1] * len(self._navigable_ys) + map_index[0]) + return graph_nodes + + def _map_to_graph(self, geometric_map: np.array) -> nx.Graph: + # after bitwise_and op, 0 indicates free or unexplored, 1 indicates an explored obstacle + occupancy_map = np.bitwise_and(geometric_map[:, :, 0] >= self._obstacle_threshold, + geometric_map[:, :, 1] >= self._obstacle_threshold) + graph = nx.Graph() + for idx_y, y in enumerate(self._navigable_ys): + for 
idx_x, x in enumerate(self._navigable_xs): + node_index = y * len(self._navigable_ys) + x + + if occupancy_map[y][x]: + # obstacle + continue + + # no obstacle to the next navigable point along +Z direction + if idx_y < len(self._navigable_ys) - 1: + next_y = self._navigable_ys[idx_y + 1] + if not any(occupancy_map[y: next_y+1, x]): + next_node_index = next_y * len(self._navigable_ys) + x + if node_index not in graph: + graph.add_node(node_index, map_index=(x, y)) + if next_node_index not in graph: + graph.add_node(next_node_index, map_index=(x, next_y)) + graph.add_edge(node_index, next_node_index) + + # no obstacle to the next navigable point along +X direction + if idx_x < len(self._navigable_xs) - 1: + next_x = self._navigable_xs[idx_x + 1] + if not any(occupancy_map[y, x: next_x+1]): + next_node_index = y * len(self._navigable_ys) + next_x + if node_index not in graph: + graph.add_node(node_index, map_index=(x, y)) + if next_node_index not in graph: + graph.add_node(next_node_index, map_index=(next_x, y)) + graph.add_edge(node_index, next_node_index) + + # trim the graph such that it only keeps the largest subgraph + connected_subgraphs = (graph.subgraph(c) for c in nx.connected_components(graph)) + max_connected_graph = max(connected_subgraphs, key=len) + + return nx.Graph(max_connected_graph) diff --git a/ss_baselines/av_wan/models/visual_cnn.py b/ss_baselines/av_wan/models/visual_cnn.py new file mode 100644 index 0000000..c2c7b9a --- /dev/null +++ b/ss_baselines/av_wan/models/visual_cnn.py @@ -0,0 +1,118 @@ +# !/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import torch +import torch.nn as nn + +from ss_baselines.common.utils import Flatten +from ss_baselines.av_nav.models.visual_cnn import conv_output_dim, layer_init + + +class VisualCNN(nn.Module): + r"""A Simple 3-Conv CNN followed by a fully connected layer + + Takes in observations and produces an embedding of the rgb and/or depth components + + Args: + observation_space: The observation_space of the agent + output_size: The size of the embedding vector + """ + + def __init__(self, observation_space, output_size, encode_rgb, encode_depth): + super().__init__() + if "rgb" in observation_space.spaces and encode_rgb: + self._n_input_rgb = observation_space.spaces["rgb"].shape[2] + else: + self._n_input_rgb = 0 + + if "depth" in observation_space.spaces and encode_depth: + self._n_input_depth = observation_space.spaces["depth"].shape[2] + else: + self._n_input_depth = 0 + + # kernel size for different CNN layers + self._cnn_layers_kernel_size = [(8, 8), (4, 4), (3, 3)] + + # strides for different CNN layers + self._cnn_layers_stride = [(4, 4), (2, 2), (1, 1)] + + if self._n_input_rgb > 0: + cnn_dims = np.array( + observation_space.spaces["rgb"].shape[:2], dtype=np.float32 + ) + elif self._n_input_depth > 0: + cnn_dims = np.array( + observation_space.spaces["depth"].shape[:2], dtype=np.float32 + ) + + if self.is_blind: + self.cnn = nn.Sequential() + else: + for kernel_size, stride in zip( + self._cnn_layers_kernel_size, self._cnn_layers_stride + ): + cnn_dims = conv_output_dim( + dimension=cnn_dims, + padding=np.array([0, 0], dtype=np.float32), + dilation=np.array([1, 1], dtype=np.float32), + kernel_size=np.array(kernel_size, dtype=np.float32), + stride=np.array(stride, dtype=np.float32), + ) + + self.cnn = nn.Sequential( + nn.Conv2d( + 
in_channels=self._n_input_rgb + self._n_input_depth, + out_channels=32, + kernel_size=self._cnn_layers_kernel_size[0], + stride=self._cnn_layers_stride[0], + ), + nn.ReLU(True), + nn.Conv2d( + in_channels=32, + out_channels=64, + kernel_size=self._cnn_layers_kernel_size[1], + stride=self._cnn_layers_stride[1], + ), + nn.ReLU(True), + nn.Conv2d( + in_channels=64, + out_channels=32, + kernel_size=self._cnn_layers_kernel_size[2], + stride=self._cnn_layers_stride[2], + ), + # nn.ReLU(True), + Flatten(), + nn.Linear(32 * cnn_dims[0] * cnn_dims[1], output_size), + nn.ReLU(True), + ) + + layer_init(self.cnn) + + @property + def is_blind(self): + return self._n_input_rgb + self._n_input_depth == 0 + + def forward(self, observations): + cnn_input = [] + if self._n_input_rgb > 0: + rgb_observations = observations["rgb"] + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + rgb_observations = rgb_observations.permute(0, 3, 1, 2) + rgb_observations = rgb_observations / 255.0 # normalize RGB + cnn_input.append(rgb_observations) + + if self._n_input_depth > 0: + depth_observations = observations["depth"] + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + depth_observations = depth_observations.permute(0, 3, 1, 2) + cnn_input.append(depth_observations) + + cnn_input = torch.cat(cnn_input, dim=1) + + return self.cnn(cnn_input) diff --git a/ss_baselines/av_wan/ppo/__init__.py b/ss_baselines/av_wan/ppo/__init__.py new file mode 100644 index 0000000..18e86e6 --- /dev/null +++ b/ss_baselines/av_wan/ppo/__init__.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from ss_baselines.av_wan.ppo.policy import Net, AudioNavBaselinePolicy, Policy +from ss_baselines.av_nav.ppo.ppo import PPO + +__all__ = ["PPO", "Policy", "Net", "AudioNavBaselinePolicy"] diff --git a/ss_baselines/av_wan/ppo/policy.py b/ss_baselines/av_wan/ppo/policy.py new file mode 100644 index 0000000..f24ccfa --- /dev/null +++ b/ss_baselines/av_wan/ppo/policy.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
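For reference, VisualCNN.forward expects each visual observation as a [BATCH x HEIGHT x WIDTH x CHANNEL] tensor and converts it to the channels-first layout Conv2d requires, scaling RGB into [0, 1]. A minimal sketch of that input contract (the 128x128 resolution and batch size of 4 are assumed illustration values, not config defaults):

```python
import torch

# Assumed illustration values: 4 observations at 128x128, RGB plus depth.
batch = {
    "rgb": torch.randint(0, 256, (4, 128, 128, 3)).float(),
    "depth": torch.rand(4, 128, 128, 1),
}

# Mirror of the permute-and-normalize steps in VisualCNN.forward:
rgb = batch["rgb"].permute(0, 3, 1, 2) / 255.0  # [4, 3, 128, 128], in [0, 1]
depth = batch["depth"].permute(0, 3, 1, 2)      # [4, 1, 128, 128]
cnn_input = torch.cat([rgb, depth], dim=1)      # [4, 4, 128, 128]
assert cnn_input.shape == (4, 4, 128, 128)
```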
+ +import abc +import logging + +import torch +import torch.nn as nn +from torchsummary import summary + +from ss_baselines.common.utils import CategoricalNetWithMask +from ss_baselines.av_nav.models.rnn_state_encoder import RNNStateEncoder +from ss_baselines.av_wan.models.visual_cnn import VisualCNN +from ss_baselines.av_wan.models.map_cnn import MapCNN +from ss_baselines.av_wan.models.audio_cnn import AudioCNN + +DUAL_GOAL_DELIMITER = ',' + + +class Policy(nn.Module): + def __init__(self, net, dim_actions, masking=True): + super().__init__() + self.net = net + self.dim_actions = dim_actions + + self.action_distribution = CategoricalNetWithMask( + self.net.output_size, self.dim_actions, masking + ) + self.critic = CriticHead(self.net.output_size) + + def forward(self, *x): + raise NotImplementedError + + def act( + self, + observations, + rnn_hidden_states, + prev_actions, + masks, + deterministic=False, + ): + features, rnn_hidden_states = self.net( + observations, rnn_hidden_states, prev_actions, masks + ) + distribution = self.action_distribution(features, observations['action_map']) + value = self.critic(features) + + if deterministic: + action = distribution.mode() + else: + action = distribution.sample() + + action_log_probs = distribution.log_probs(action) + + return value, action, action_log_probs, rnn_hidden_states, distribution + + def get_value(self, observations, rnn_hidden_states, prev_actions, masks): + features, _ = self.net( + observations, rnn_hidden_states, prev_actions, masks + ) + return self.critic(features) + + def evaluate_actions( + self, observations, rnn_hidden_states, prev_actions, masks, action + ): + features, rnn_hidden_states = self.net( + observations, rnn_hidden_states, prev_actions, masks + ) + distribution = self.action_distribution(features, observations['action_map']) + value = self.critic(features) + + action_log_probs = distribution.log_probs(action) + distribution_entropy = distribution.entropy().mean() + + return value, action_log_probs, distribution_entropy, rnn_hidden_states + + +class CriticHead(nn.Module): + def __init__(self, input_size): + super().__init__() + self.fc = nn.Linear(input_size, 1) + nn.init.orthogonal_(self.fc.weight) + nn.init.constant_(self.fc.bias, 0) + + def forward(self, x): + return self.fc(x) + + +class AudioNavBaselinePolicy(Policy): + def __init__( + self, + observation_space, + goal_sensor_uuid, + masking, + action_map_size, + hidden_size=512, + encode_rgb=False, + encode_depth=False + ): + super().__init__( + AudioNavBaselineNet( + observation_space=observation_space, + hidden_size=hidden_size, + goal_sensor_uuid=goal_sensor_uuid, + encode_rgb=encode_rgb, + encode_depth=encode_depth + ), + # action_space.n, + action_map_size ** 2, + masking=masking + ) + + +class Net(nn.Module, metaclass=abc.ABCMeta): + @abc.abstractmethod + def forward(self, observations, rnn_hidden_states, prev_actions, masks): + pass + + @property + @abc.abstractmethod + def output_size(self): + pass + + @property + @abc.abstractmethod + def num_recurrent_layers(self): + pass + + @property + @abc.abstractmethod + def is_blind(self): + pass + + +class AudioNavBaselineNet(Net): + r"""Network which passes the input image through CNN and concatenates + goal vector with CNN's output and passes that through RNN. 
+ """ + + def __init__(self, observation_space, hidden_size, goal_sensor_uuid, encode_rgb, encode_depth): + super().__init__() + self.goal_sensor_uuid = goal_sensor_uuid + self._hidden_size = hidden_size + self._spectrogram = False + self._gm = 'gm' in observation_space.spaces + self._am = 'am' in observation_space.spaces + + self._spectrogram = 'spectrogram' == self.goal_sensor_uuid + self.visual_encoder = VisualCNN(observation_space, hidden_size, encode_rgb, encode_depth) + if self._spectrogram: + self.audio_encoder = AudioCNN(observation_space, hidden_size) + if self._gm: + self.gm_encoder = MapCNN(observation_space, hidden_size, map_type='gm') + if self._am: + self.am_encoder = MapCNN(observation_space, hidden_size, map_type='am') + + rnn_input_size = (0 if self.is_blind else self._hidden_size) + \ + (self._hidden_size if self._spectrogram else 0) + \ + (self._hidden_size if self._gm else 0) + \ + (self._hidden_size if self._am else 0) + self.state_encoder = RNNStateEncoder(rnn_input_size, self._hidden_size) + + if 'rgb' in observation_space.spaces and encode_rgb: + rgb_shape = observation_space.spaces['rgb'].shape + summary(self.visual_encoder.cnn, (rgb_shape[2], rgb_shape[0], rgb_shape[1]), device='cpu') + if 'depth' in observation_space.spaces and encode_depth: + depth_shape = observation_space.spaces['depth'].shape + summary(self.visual_encoder.cnn, (depth_shape[2], depth_shape[0], depth_shape[1]), device='cpu') + if 'spectrogram' in observation_space.spaces: + audio_shape = observation_space.spaces['spectrogram'].shape + summary(self.audio_encoder.cnn, (audio_shape[2], audio_shape[0], audio_shape[1]), device='cpu') + if self._gm: + gm_shape = observation_space.spaces['gm'].shape + summary(self.gm_encoder.cnn, (gm_shape[2], gm_shape[0], gm_shape[1]), device='cpu') + if self._am: + am_shape = observation_space.spaces['am'].shape + summary(self.am_encoder.cnn, (am_shape[2], am_shape[0], am_shape[1]), device='cpu') + + self.train() + + @property + def output_size(self): + return self._hidden_size + + @property + def is_blind(self): + return self.visual_encoder.is_blind + + @property + def num_recurrent_layers(self): + return self.state_encoder.num_recurrent_layers + + def forward(self, observations, rnn_hidden_states, prev_actions, masks): + x = [] + + if self._spectrogram: + x.append(self.audio_encoder(observations)) + if self._gm: + x.append(self.gm_encoder(observations)) + if self._am: + x.append(self.am_encoder(observations)) + if not self.is_blind: + x.append(self.visual_encoder(observations)) + + x1 = torch.cat(x, dim=1) + x2, rnn_hidden_states1 = self.state_encoder(x1, rnn_hidden_states, masks) + + assert not torch.isnan(x2).any().item() + + return x2, rnn_hidden_states1 diff --git a/ss_baselines/av_wan/ppo/ppo_trainer.py b/ss_baselines/av_wan/ppo/ppo_trainer.py new file mode 100644 index 0000000..8b6419a --- /dev/null +++ b/ss_baselines/av_wan/ppo/ppo_trainer.py @@ -0,0 +1,751 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import os +import time +import logging +from collections import deque +from typing import Dict, List +import json +import random + +import numpy as np +import torch +from torch.optim.lr_scheduler import LambdaLR +from tqdm import tqdm +from numpy.linalg import norm + +from habitat import Config, logger +from ss_baselines.common.base_trainer import BaseRLTrainer +from ss_baselines.common.baseline_registry import baseline_registry +from ss_baselines.common.env_utils import construct_envs +from ss_baselines.common.environments import get_env_class +from ss_baselines.common.rollout_storage import RolloutStorage +from ss_baselines.common.tensorboard_utils import TensorboardWriter +from ss_baselines.common.utils import ( + batch_obs, + generate_video, + linear_decay, + exponential_decay, + plot_top_down_map, + resize_observation +) +from ss_baselines.av_wan.ppo import AudioNavBaselinePolicy +from ss_baselines.av_wan.ppo import PPO + + +@baseline_registry.register_trainer(name="AVWanTrainer") +class PPOTrainer(BaseRLTrainer): + r"""Trainer class for PPO algorithm + Paper: https://arxiv.org/abs/1707.06347. + """ + supported_tasks = ["Nav-v0"] + + def __init__(self, config=None): + super().__init__(config) + self.actor_critic = None + self.agent = None + self.envs = None + + def _setup_actor_critic_agent(self, ppo_cfg: Config, observation_space=None) -> None: + r"""Sets up actor critic and agent for PPO. + + Args: + ppo_cfg: config node with relevant params + + Returns: + None + """ + logger.add_filehandler(self.config.LOG_FILE) + + if observation_space is None: + observation_space = self.envs.observation_spaces[0] + self.actor_critic = AudioNavBaselinePolicy( + observation_space=observation_space, + hidden_size=ppo_cfg.hidden_size, + goal_sensor_uuid=self.config.TASK_CONFIG.TASK.GOAL_SENSOR_UUID, + masking=self.config.MASKING, + encode_rgb=self.config.ENCODE_RGB, + encode_depth=self.config.ENCODE_DEPTH, + action_map_size=self.config.TASK_CONFIG.TASK.ACTION_MAP.MAP_SIZE + ) + self.actor_critic.to(self.device) + + self.agent = PPO( + actor_critic=self.actor_critic, + clip_param=ppo_cfg.clip_param, + ppo_epoch=ppo_cfg.ppo_epoch, + num_mini_batch=ppo_cfg.num_mini_batch, + value_loss_coef=ppo_cfg.value_loss_coef, + entropy_coef=ppo_cfg.entropy_coef, + lr=ppo_cfg.lr, + eps=ppo_cfg.eps, + max_grad_norm=ppo_cfg.max_grad_norm, + ) + + def save_checkpoint(self, file_name: str) -> None: + r"""Save checkpoint with specified name. + + Args: + file_name: file name for checkpoint + + Returns: + None + """ + checkpoint = { + "state_dict": self.agent.state_dict(), + "config": self.config, + } + torch.save( + checkpoint, os.path.join(self.config.CHECKPOINT_FOLDER, file_name) + ) + + def load_checkpoint(self, checkpoint_path: str, *args, **kwargs) -> Dict: + r"""Load checkpoint of specified path as a dict. 
+ + Args: + checkpoint_path: path of target checkpoint + *args: additional positional args + **kwargs: additional keyword args + + Returns: + dict containing checkpoint info + """ + return torch.load(checkpoint_path, *args, **kwargs) + + def _collect_rollout_step( + self, rollouts, current_episode_reward, current_episode_step, episode_rewards, + episode_spls, episode_counts, episode_steps, episode_distances + ): + pth_time = 0.0 + env_time = 0.0 + + t_sample_action = time.time() + # sample actions + with torch.no_grad(): + step_observation = { + k: v[rollouts.step] for k, v in rollouts.observations.items() + } + + ( + values, + actions, + actions_log_probs, + recurrent_hidden_states, + distributions + ) = self.actor_critic.act( + step_observation, + rollouts.recurrent_hidden_states[rollouts.step], + rollouts.prev_actions[rollouts.step], + rollouts.masks[rollouts.step] + ) + + pth_time += time.time() - t_sample_action + t_step_env = time.time() + + outputs = self.envs.step([{"action": a[0].item()} for a in actions]) + observations, rewards, dones, infos = [list(x) for x in zip(*outputs)] + + logging.debug('Reward: {}'.format(rewards[0])) + + env_time += time.time() - t_step_env + + t_update_stats = time.time() + batch = batch_obs(observations) + rewards = torch.tensor(rewards, dtype=torch.float) + rewards = rewards.unsqueeze(1) + + masks = torch.tensor( + [[0.0] if done else [1.0] for done in dones], dtype=torch.float + ) + spls = torch.tensor( + [[info['spl']] for info in infos] + ) + + distances = torch.tensor( + [[info['distance_to_goal']] for info in infos] + ) + + current_episode_reward += rewards + current_episode_step += 1 + # current_episode_reward is accumulating rewards across multiple updates, + # as long as the current episode is not finished + # the current episode reward is added to the episode rewards only if the current episode is done + # the episode count will also increase by 1 + episode_rewards += (1 - masks) * current_episode_reward + episode_spls += (1 - masks) * spls + episode_steps += (1 - masks) * current_episode_step + episode_counts += 1 - masks + episode_distances += (1 - masks) * distances + current_episode_reward *= masks + current_episode_step *= masks + + rollouts.insert( + batch, + recurrent_hidden_states, + actions, + actions_log_probs, + values, + rewards, + masks + ) + + pth_time += time.time() - t_update_stats + + return pth_time, env_time, self.envs.num_envs + + def _update_agent(self, ppo_cfg, rollouts): + t_update_model = time.time() + with torch.no_grad(): + last_observation = { + k: v[-1] for k, v in rollouts.observations.items() + } + next_value = self.actor_critic.get_value( + last_observation, + rollouts.recurrent_hidden_states[-1], + rollouts.prev_actions[-1], + rollouts.masks[-1], + ).detach() + + rollouts.compute_returns( + next_value, ppo_cfg.use_gae, ppo_cfg.gamma, ppo_cfg.tau + ) + + value_loss, action_loss, dist_entropy = self.agent.update(rollouts) + + rollouts.after_update() + + return ( + time.time() - t_update_model, + value_loss, + action_loss, + dist_entropy, + ) + + def train(self) -> None: + r"""Main method for training PPO. 
+ + Returns: + None + """ + global lr_lambda + logger.info(f"config: {self.config}") + random.seed(self.config.SEED) + np.random.seed(self.config.SEED) + torch.manual_seed(self.config.SEED) + + self.envs = construct_envs( + self.config, get_env_class(self.config.ENV_NAME), auto_reset_done=False + ) + + ppo_cfg = self.config.RL.PPO + self.device = ( + torch.device("cuda", self.config.TORCH_GPU_ID) + if torch.cuda.is_available() + else torch.device("cpu") + ) + if not os.path.isdir(self.config.CHECKPOINT_FOLDER): + os.makedirs(self.config.CHECKPOINT_FOLDER) + self._setup_actor_critic_agent(ppo_cfg) + logger.info( + "agent number of parameters: {}".format( + sum(param.numel() for param in self.agent.parameters()) + ) + ) + + rollouts = RolloutStorage( + ppo_cfg.num_steps, + self.envs.num_envs, + self.envs.observation_spaces[0], + self.envs.action_spaces[0], + ppo_cfg.hidden_size + ) + rollouts.to(self.device) + + observations = self.envs.reset() + batch = batch_obs(observations) + + for sensor in rollouts.observations: + rollouts.observations[sensor][0].copy_(batch[sensor]) + + # batch and observations may contain shared PyTorch CUDA + # tensors. We must explicitly clear them here otherwise + # they will be kept in memory for the entire duration of training! + batch = None + observations = None + + # episode_rewards and episode_counts accumulates over the entire training course + episode_rewards = torch.zeros(self.envs.num_envs, 1) + episode_spls = torch.zeros(self.envs.num_envs, 1) + episode_steps = torch.zeros(self.envs.num_envs, 1) + episode_counts = torch.zeros(self.envs.num_envs, 1) + episode_distances = torch.zeros(self.envs.num_envs, 1) + current_episode_reward = torch.zeros(self.envs.num_envs, 1) + current_episode_step = torch.zeros(self.envs.num_envs, 1) + window_episode_reward = deque(maxlen=ppo_cfg.reward_window_size) + window_episode_spl = deque(maxlen=ppo_cfg.reward_window_size) + window_episode_step = deque(maxlen=ppo_cfg.reward_window_size) + window_episode_counts = deque(maxlen=ppo_cfg.reward_window_size) + window_episode_distances = deque(maxlen=ppo_cfg.reward_window_size) + + t_start = time.time() + env_time = 0 + pth_time = 0 + count_steps = 0 + count_checkpoints = 0 + start_update = 0 + prev_time = 0 + + if ppo_cfg.use_linear_lr_decay: + def lr_lambda(x): + return linear_decay(x, self.config.NUM_UPDATES) + elif ppo_cfg.use_exponential_lr_decay: + def lr_lambda(x): + return exponential_decay(x, self.config.NUM_UPDATES, ppo_cfg.exp_decay_lambda) + else: + def lr_lambda(x): + return 1 + lr_scheduler = LambdaLR( + optimizer=self.agent.optimizer, + lr_lambda=lr_lambda + ) + + with TensorboardWriter( + self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs + ) as writer: + for update in range(start_update, self.config.NUM_UPDATES): + if ppo_cfg.use_linear_lr_decay or ppo_cfg.use_exponential_lr_decay: + lr_scheduler.step() + + if ppo_cfg.use_linear_clip_decay: + self.agent.clip_param = ppo_cfg.clip_param * linear_decay( + update, self.config.NUM_UPDATES + ) + + for step in range(ppo_cfg.num_steps): + delta_pth_time, delta_env_time, delta_steps = self._collect_rollout_step( + rollouts, + current_episode_reward, + current_episode_step, + episode_rewards, + episode_spls, + episode_counts, + episode_steps, + episode_distances + ) + pth_time += delta_pth_time + env_time += delta_env_time + count_steps += delta_steps + + delta_pth_time, value_loss, action_loss, dist_entropy = self._update_agent( + ppo_cfg, rollouts + ) + pth_time += delta_pth_time + + 
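The windowed statistics appended below rely on episode_rewards and friends being cumulative sums: the deque stores snapshots, so subtracting the oldest snapshot from the newest isolates what accrued inside the window. A small sketch with made-up rewards:

```python
from collections import deque

import torch

window = deque(maxlen=3)
cumulative_reward = torch.zeros(2, 1)  # running sum over 2 envs

for rewards in ([1.0, 0.5], [2.0, 0.0], [0.0, 1.5], [1.0, 1.0]):
    cumulative_reward += torch.tensor(rewards).unsqueeze(1)
    window.append(cumulative_reward.clone())

# Snapshots from updates 2..4 remain; the delta isolates updates 3 and 4.
delta = (window[-1] - window[0]).sum().item()
assert delta == 3.5
```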
window_episode_reward.append(episode_rewards.clone()) + window_episode_spl.append(episode_spls.clone()) + window_episode_step.append(episode_steps.clone()) + window_episode_counts.append(episode_counts.clone()) + window_episode_distances.append(episode_distances.clone()) + + losses = [value_loss, action_loss, dist_entropy] + stats = zip( + ["count", "reward", "step", 'spl', 'distance'], + [window_episode_counts, window_episode_reward, window_episode_step, window_episode_spl, + window_episode_distances], + ) + deltas = { + k: ( + (v[-1] - v[0]).sum().item() + if len(v) > 1 + else v[0].sum().item() + ) + for k, v in stats + } + deltas["count"] = max(deltas["count"], 1.0) + + # this reward is averaged over all the episodes happened during window_size updates + # approximately number of steps is window_size * num_steps + writer.add_scalar( + "Environment/Reward", deltas["reward"] / deltas["count"], count_steps + ) + + writer.add_scalar( + "Environment/SPL", deltas["spl"] / deltas["count"], count_steps + ) + + logging.debug('Number of steps: {}'.format(deltas["step"] / deltas["count"])) + writer.add_scalar( + "Environment/Episode_length", deltas["step"] / deltas["count"], count_steps + ) + + writer.add_scalar( + "Environment/Distance_to_goal", deltas["distance"] / deltas["count"], count_steps + ) + + # writer.add_scalars( + # "losses", + # {k: l for l, k in zip(losses, ["value", "policy"])}, + # count_steps, + # ) + + writer.add_scalar( + 'Policy/Value_Loss', value_loss, count_steps + ) + writer.add_scalar( + 'Policy/Action_Loss', action_loss, count_steps + ) + writer.add_scalar( + 'Policy/Entropy', dist_entropy, count_steps + ) + writer.add_scalar( + 'Policy/Learning_Rate', lr_scheduler.get_lr()[0], count_steps + ) + + # log stats + if update > 0 and update % self.config.LOG_INTERVAL == 0: + logger.info( + "update: {}\tfps: {:.3f}\t".format( + update, count_steps / ((time.time() - t_start) + prev_time) + ) + ) + + logger.info( + "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t" + "frames: {}".format( + update, env_time, pth_time, count_steps + ) + ) + + window_rewards = ( + window_episode_reward[-1] - window_episode_reward[0] + ).sum() + window_counts = ( + window_episode_counts[-1] - window_episode_counts[0] + ).sum() + + if window_counts > 0: + logger.info( + "Average window size {} reward: {:3f}".format( + len(window_episode_reward), + (window_rewards / window_counts).item(), + ) + ) + else: + logger.info("No episodes finish in current window") + + # checkpoint model + if update % self.config.CHECKPOINT_INTERVAL == 0: + self.save_checkpoint(f"ckpt.{count_checkpoints}.pth") + count_checkpoints += 1 + + self.envs.close() + + def _eval_checkpoint( + self, + checkpoint_path: str, + writer: TensorboardWriter, + checkpoint_index: int = 0 + ) -> Dict: + r"""Evaluates a single checkpoint. + + Args: + checkpoint_path: path of checkpoint + writer: tensorboard writer object for logging to tensorboard + checkpoint_index: index of cur checkpoint for logging + + Returns: + None + """ + random.seed(self.config.SEED) + np.random.seed(self.config.SEED) + torch.manual_seed(self.config.SEED) + + # Map location CPU is almost always better than mapping to a CUDA device. 
+ ckpt_dict = self.load_checkpoint(checkpoint_path, map_location="cpu") + + if self.config.EVAL.USE_CKPT_CONFIG: + config = self._setup_eval_config(ckpt_dict["config"]) + else: + config = self.config.clone() + + ppo_cfg = config.RL.PPO + + config.defrost() + config.TASK_CONFIG.DATASET.SPLIT = config.EVAL.SPLIT + if self.config.DISPLAY_RESOLUTION != config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.WIDTH: + model_resolution = config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.WIDTH + config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.WIDTH = config.TASK_CONFIG.SIMULATOR.RGB_SENSOR.HEIGHT = \ + config.TASK_CONFIG.SIMULATOR.RGB_SENSOR.WIDTH = config.TASK_CONFIG.SIMULATOR.DEPTH_SENSOR.HEIGHT = \ + self.config.DISPLAY_RESOLUTION + else: + model_resolution = self.config.DISPLAY_RESOLUTION + config.freeze() + + if len(self.config.VIDEO_OPTION) > 0: + config.defrost() + config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") + config.TASK_CONFIG.TASK.MEASUREMENTS.append("COLLISIONS") + config.TASK_CONFIG.TASK.SENSORS.append("AUDIOGOAL_SENSOR") + config.freeze() + elif "top_down_map" in self.config.VISUALIZATION_OPTION: + config.defrost() + config.TASK_CONFIG.TASK.MEASUREMENTS.append("TOP_DOWN_MAP") + config.freeze() + + logger.info(f"env config: {config}") + self.envs = construct_envs( + config, get_env_class(config.ENV_NAME), auto_reset_done=False + ) + if self.config.DISPLAY_RESOLUTION != model_resolution: + observation_space = self.envs.observation_spaces[0] + observation_space.spaces['depth'].shape = (model_resolution, model_resolution, 1) + observation_space.spaces['rgb'].shape = (model_resolution, model_resolution, 3) + else: + observation_space = self.envs.observation_spaces[0] + self._setup_actor_critic_agent(ppo_cfg) + + self.agent.load_state_dict(ckpt_dict["state_dict"]) + self.actor_critic = self.agent.actor_critic + + self.metric_uuids = [] + for metric_name in self.config.TASK_CONFIG.TASK.MEASUREMENTS: + metric_cfg = getattr(self.config.TASK_CONFIG.TASK, metric_name) + measure_type = baseline_registry.get_measure(metric_cfg.TYPE) + assert measure_type is not None, "invalid measurement type {}".format( + metric_cfg.TYPE + ) + self.metric_uuids.append(measure_type(sim=None, task=None, config=None)._get_uuid()) + + observations = self.envs.reset() + if self.config.DISPLAY_RESOLUTION != model_resolution: + resize_observation(observations, model_resolution) + batch = batch_obs(observations, self.device) + + current_episode_reward = torch.zeros( + self.envs.num_envs, 1, device=self.device + ) + current_episode_reaching_waypoint = torch.zeros( + self.envs.num_envs, 1, device=self.device + ) + current_episode_cant_reach_waypoint = torch.zeros( + self.envs.num_envs, 1, device=self.device + ) + current_episode_step_count = torch.zeros( + self.envs.num_envs, 1, device=self.device + ) + + test_recurrent_hidden_states = torch.zeros( + self.actor_critic.net.num_recurrent_layers, + self.config.NUM_PROCESSES, + ppo_cfg.hidden_size, + device=self.device, + ) + prev_actions = torch.zeros( + self.config.NUM_PROCESSES, 2, device=self.device, dtype=torch.long + ) + not_done_masks = torch.zeros( + self.config.NUM_PROCESSES, 1, device=self.device + ) + stats_episodes = dict() # dict of dicts that stores stats per episode + + rgb_frames = [ + [] for _ in range(self.config.NUM_PROCESSES) + ] # type: List[List[np.ndarray]] + audios = [ + [] for _ in range(self.config.NUM_PROCESSES) + ] + if len(self.config.VIDEO_OPTION) > 0: + os.makedirs(self.config.VIDEO_DIR, exist_ok=True) + + t = tqdm(total=self.config.TEST_EPISODE_COUNT) + 
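Throughout the evaluation loop, not_done_masks encodes termination as 0.0 (episode done) or 1.0 (still running), so multiplying a running counter by the mask resets it exactly when an episode ends. A minimal sketch with illustrative values:

```python
import torch

dones = [False, True, False]
not_done_masks = torch.tensor(
    [[0.0] if done else [1.0] for done in dones], dtype=torch.float
)

current_episode_reward = torch.tensor([[3.0], [5.0], [1.0]])
current_episode_reward *= not_done_masks  # env 1 just finished, so it resets
assert current_episode_reward.squeeze(1).tolist() == [3.0, 0.0, 1.0]
```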
while ( + len(stats_episodes) < self.config.TEST_EPISODE_COUNT + and self.envs.num_envs > 0 + ): + current_episodes = self.envs.current_episodes() + + with torch.no_grad(): + _, actions, _, test_recurrent_hidden_states, distributions = self.actor_critic.act( + batch, + test_recurrent_hidden_states, + prev_actions, + not_done_masks, + deterministic=True + ) + + prev_actions.copy_(actions) + + outputs = self.envs.step([{"action": a[0].item()} for a in actions]) + observations, rewards, dones, infos = [list(x) for x in zip(*outputs)] + if config.DISPLAY_RESOLUTION != model_resolution: + resize_observation(observations, model_resolution) + + batch = batch_obs(observations, self.device) + if len(self.config.VIDEO_OPTION) > 0: + rgb_frames[0] += infos[0]['rgb_frames'] + audios[0] += infos[0]['audios'] + + not_done_masks = torch.tensor( + [[0.0] if done else [1.0] for done in dones], + dtype=torch.float, + device=self.device, + ) + logging.debug('Reward: {}'.format(rewards[0])) + + rewards = torch.tensor( + rewards, dtype=torch.float, device=self.device + ).unsqueeze(1) + current_episode_reward += rewards + current_episode_step_count += 1 + next_episodes = self.envs.current_episodes() + n_envs = self.envs.num_envs + envs_to_pause = [] + for i in range(n_envs): + # pause envs which runs out of episodes + if ( + next_episodes[i].scene_id, + next_episodes[i].episode_id, + ) in stats_episodes: + envs_to_pause.append(i) + logging.info('Pause env {} and remaining number of envs: {}'.format(i, self.envs.num_envs)) + + current_episode_reaching_waypoint[i] += infos[i]['reaching_waypoint'] + current_episode_cant_reach_waypoint[i] += infos[i]['cant_reach_waypoint'] + + # episode ended + if not_done_masks[i].item() == 0: + episode_stats = dict() + for metric_uuid in self.metric_uuids: + episode_stats[metric_uuid] = infos[i][metric_uuid] + episode_stats["reward"] = current_episode_reward[i].item() + episode_stats['geodesic_distance'] = current_episodes[i].info['geodesic_distance'] + episode_stats['euclidean_distance'] = norm(np.array(current_episodes[i].goals[0].position) - + np.array(current_episodes[i].start_position)) + episode_stats["reaching_waypoint"] = current_episode_reaching_waypoint[i].item() / \ + current_episode_step_count[i].item() + episode_stats["cant_reach_waypoint"] = current_episode_cant_reach_waypoint[i].item() / \ + current_episode_step_count[i].item() + current_episode_reaching_waypoint[i] = 0 + current_episode_cant_reach_waypoint[i] = 0 + current_episode_step_count[i] = 0 + current_episode_reward[i] = 0 + logging.debug(episode_stats) + # use scene_id + episode_id as unique id for storing stats + stats_episodes[ + ( + current_episodes[i].scene_id, + current_episodes[i].episode_id, + ) + ] = episode_stats + t.update() + + if len(self.config.VIDEO_OPTION) > 0: + if self.config.VISUALIZE_FAILURE_ONLY and infos[i]['success'] > 0: + pass + else: + fps = self.config.TASK_CONFIG.SIMULATOR.VIEW_CHANGE_FPS \ + if self.config.TASK_CONFIG.SIMULATOR.CONTINUOUS_VIEW_CHANGE else 1 + if 'sound' in current_episodes[i].info: + sound = current_episodes[i].info['sound'] + else: + sound = current_episodes[i].sound_id.split('/')[1][:-4] + generate_video( + video_option=self.config.VIDEO_OPTION, + video_dir=self.config.VIDEO_DIR, + images=rgb_frames[i][:-1], + scene_name=current_episodes[i].scene_id.split('/')[3], + sound=sound, + sr=self.config.TASK_CONFIG.SIMULATOR.AUDIO.RIR_SAMPLING_RATE, + episode_id=current_episodes[i].episode_id, + checkpoint_idx=checkpoint_index, + metric_name='spl', + 
metric_value=infos[i]['spl'], + tb_writer=writer, + audios=audios[i][:-1], + fps=fps + ) + + rgb_frames[i] = [] + audios[i] = [] + + if "top_down_map" in self.config.VISUALIZATION_OPTION: + top_down_map = plot_top_down_map(infos[i]) + scene = current_episodes[i].scene_id.split('/')[-3] + writer.add_image('{}_{}_{}/{}'.format(config.EVAL.SPLIT, scene, current_episodes[i].episode_id, + config.BASE_TASK_CONFIG_PATH.split('/')[-1][:-5]), + top_down_map, + dataformats='WHC') + + ( + self.envs, + test_recurrent_hidden_states, + not_done_masks, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + ) = self._pause_envs( + envs_to_pause, + self.envs, + test_recurrent_hidden_states, + not_done_masks, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + ) + + aggregated_stats = dict() + for stat_key in next(iter(stats_episodes.values())).keys(): + aggregated_stats[stat_key] = sum( + [v[stat_key] for v in stats_episodes.values()] + ) + num_episodes = len(stats_episodes) + + stats_file = os.path.join(config.TENSORBOARD_DIR, '{}_stats_{}.json'.format(config.EVAL.SPLIT, config.SEED)) + new_stats_episodes = {','.join(key): value for key, value in stats_episodes.items()} + with open(stats_file, 'w') as fo: + json.dump(new_stats_episodes, fo) + + episode_reward_mean = aggregated_stats["reward"] / num_episodes + episode_reaching_waypoint_mean = aggregated_stats["reaching_waypoint"] / num_episodes + episode_cant_reach_waypoint_mean = aggregated_stats["cant_reach_waypoint"] / num_episodes + episode_metrics_mean = {} + for metric_uuid in self.metric_uuids: + episode_metrics_mean[metric_uuid] = aggregated_stats[metric_uuid] / num_episodes + + logger.info(f"Average episode reward: {episode_reward_mean:.6f}") + logger.info(f"Average episode reaching_waypoint: {episode_reaching_waypoint_mean:.6f}") + logger.info(f"Average episode cant_reach_waypoint: {episode_cant_reach_waypoint_mean:.6f}") + for metric_uuid in self.metric_uuids: + logger.info( + f"Average episode {metric_uuid}: {episode_metrics_mean[metric_uuid]:.6f}" + ) + + if not config.EVAL.SPLIT.startswith('test'): + writer.add_scalar("{}/reward".format(config.EVAL.SPLIT), episode_reward_mean, checkpoint_index) + writer.add_scalar("{}/reaching_waypoint".format(config.EVAL.SPLIT), episode_reaching_waypoint_mean, + checkpoint_index) + writer.add_scalar("{}/cant_reach_waypoint".format(config.EVAL.SPLIT), episode_cant_reach_waypoint_mean, + checkpoint_index) + for metric_uuid in self.metric_uuids: + writer.add_scalar(f"{config.EVAL.SPLIT}/{metric_uuid}", episode_metrics_mean[metric_uuid], + checkpoint_index) + + self.envs.close() + + result = { + 'episode_reward_mean': episode_reward_mean, + 'episode_reaching_waypoint_mean': episode_reaching_waypoint_mean, + 'episode_cant_reach_waypoint_mean': episode_cant_reach_waypoint_mean + } + for metric_uuid in self.metric_uuids: + result['episode_{}_mean'.format(metric_uuid)] = episode_metrics_mean[metric_uuid] + + return result diff --git a/ss_baselines/av_wan/run.py b/ss_baselines/av_wan/run.py new file mode 100644 index 0000000..a93f1a8 --- /dev/null +++ b/ss_baselines/av_wan/run.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
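For reference, the aggregation at the end of _eval_checkpoint above reduces the per-episode stats dict to per-metric means. A sketch with fabricated episodes (the keys mirror the code; the numbers are made up):

```python
stats_episodes = {
    ("scene_a", "0"): {"reward": 9.0, "spl": 0.8},
    ("scene_a", "1"): {"reward": 4.0, "spl": 0.2},
    ("scene_b", "0"): {"reward": 8.0, "spl": 1.0},
}

# Sum each metric over episodes, then divide by the episode count.
aggregated_stats = {
    key: sum(ep[key] for ep in stats_episodes.values())
    for key in next(iter(stats_episodes.values()))
}
num_episodes = len(stats_episodes)
means = {key: value / num_episodes for key, value in aggregated_stats.items()}
assert means["reward"] == 7.0  # spl mean is 2.0 / 3
```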
+ +import argparse +import logging +import os + +import warnings +warnings.filterwarnings('ignore', category=FutureWarning) +warnings.filterwarnings('ignore', category=UserWarning) +import tensorflow as tf +import torch + +import soundspaces +from ss_baselines.common.baseline_registry import baseline_registry +from ss_baselines.av_wan.config.default import get_config + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--run-type", + choices=["train", "eval"], + # required=True, + default='train', + help="run type of the experiment (train or eval)", + ) + parser.add_argument( + "--exp-config", + type=str, + # required=True, + default='av_wan/config/pointnav_rgb.yaml', + help="path to config yaml containing info about experiment", + ) + parser.add_argument( + "opts", + default=None, + nargs=argparse.REMAINDER, + help="Modify config options from command line", + ) + parser.add_argument( + "--model-dir", + default=None, + help="Modify config options from command line", + ) + parser.add_argument( + "--overwrite", + default=False, + action='store_true', + help="Modify config options from command line" + ) + parser.add_argument( + "--eval-interval", + type=int, + default=1, + help="Evaluation interval of checkpoints", + ) + parser.add_argument( + "--prev-ckpt-ind", + type=int, + default=-1, + help="Evaluation interval of checkpoints", + ) + parser.add_argument( + "--eval-best", + default=False, + action='store_true', + help="Modify config options from command line" + ) + args = parser.parse_args() + + if args.eval_best: + best_ckpt_idx = find_best_ckpt_idx(os.path.join(args.model_dir, 'tb')) + best_ckpt_path = os.path.join(args.model_dir, 'data', f'ckpt.{best_ckpt_idx}.pth') + print(f'Evaluating the best checkpoint: {best_ckpt_path}') + args.opts += ['EVAL_CKPT_PATH_DIR', best_ckpt_path] + + # run exp + config = get_config(args.exp_config, args.opts, args.model_dir, args.run_type, args.overwrite) + trainer_init = baseline_registry.get_trainer(config.TRAINER_NAME) + assert trainer_init is not None, f"{config.TRAINER_NAME} is not supported" + trainer = trainer_init(config) + torch.set_num_threads(1) + + level = logging.DEBUG if config.DEBUG else logging.INFO + logging.basicConfig(level=level, format='%(asctime)s, %(levelname)s: %(message)s', + datefmt="%Y-%m-%d %H:%M:%S") + + if args.run_type == "train": + trainer.train() + elif args.run_type == "eval": + trainer.eval(args.eval_interval, args.prev_ckpt_ind, config.USE_LAST_CKPT) + + +def find_best_ckpt_idx(event_dir_path, min_step=-1, max_step=10000): + events = os.listdir(event_dir_path) + + max_value = 0 + max_index = -1 + for event in events: + if "events" not in event: + continue + iterator = tf.compat.v1.train.summary_iterator(os.path.join(event_dir_path, event)) + for e in iterator: + if len(e.summary.value) == 0: + continue + if not e.summary.value[0].tag.startswith('val'): + break + if 'spl' not in e.summary.value[0].tag: + continue + if not min_step <= e.step <= max_step: + continue + if len(e.summary.value) > 0 and e.summary.value[0].simple_value > max_value: + max_value = e.summary.value[0].simple_value + max_index = e.step + + if max_index == -1: + print('No max index is found in {}'.format(event_dir_path)) + else: + print('The best index in {} is {}'.format(event_dir_path, max_index)) + + return max_index + + +if __name__ == "__main__": + main() diff --git a/ss_baselines/common/__init__.py b/ss_baselines/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/ss_baselines/common/base_trainer.py b/ss_baselines/common/base_trainer.py new file mode 100644 index 0000000..8930c0d --- /dev/null +++ b/ss_baselines/common/base_trainer.py @@ -0,0 +1,289 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0-or-later +# SPDX-License-Identifier: CC-BY-4.0 + +import os +import time +from typing import ClassVar, Dict, List +import glob + +import torch + +from habitat import Config, logger +from ss_baselines.common.tensorboard_utils import TensorboardWriter +from ss_baselines.common.utils import poll_checkpoint_folder + + +class BaseTrainer: + r"""Generic trainer class that serves as a base template for more + specific trainer classes like RL trainer, SLAM or imitation learner. + Includes only the most basic functionality. + """ + + supported_tasks: ClassVar[List[str]] + + def train(self) -> None: + raise NotImplementedError + + def eval(self) -> None: + raise NotImplementedError + + def save_checkpoint(self, file_name) -> None: + raise NotImplementedError + + def load_checkpoint(self, checkpoint_path, *args, **kwargs) -> Dict: + raise NotImplementedError + + +class BaseRLTrainer(BaseTrainer): + r"""Base trainer class for RL trainers. Future RL-specific + methods should be hosted here. + """ + device: torch.device + config: Config + video_option: List[str] + _flush_secs: int + + def __init__(self, config: Config): + super().__init__() + assert config is not None, "needs config file to initialize trainer" + self.config = config + self._flush_secs = 30 + + @property + def flush_secs(self): + return self._flush_secs + + @flush_secs.setter + def flush_secs(self, value: int): + self._flush_secs = value + + def train(self) -> None: + raise NotImplementedError + + def eval(self, eval_interval=1, prev_ckpt_ind=-1, use_last_ckpt=False) -> None: + r"""Main method of trainer evaluation. 
Calls _eval_checkpoint() that + is specified in Trainer class that inherits from BaseRLTrainer + + Returns: + None + """ + self.device = ( + torch.device("cuda", self.config.TORCH_GPU_ID) + if torch.cuda.is_available() + else torch.device("cpu") + ) + + if "tensorboard" in self.config.VIDEO_OPTION: + assert ( + len(self.config.TENSORBOARD_DIR) > 0 + ), "Must specify a tensorboard directory for video display" + if "disk" in self.config.VIDEO_OPTION: + assert ( + len(self.config.VIDEO_DIR) > 0 + ), "Must specify a directory for storing videos on disk" + + with TensorboardWriter( + self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs + ) as writer: + # eval last checkpoint in the folder + if use_last_ckpt: + models_paths = list( + filter(os.path.isfile, glob.glob(self.config.EVAL_CKPT_PATH_DIR + "/*")) + ) + models_paths.sort(key=os.path.getmtime) + self.config.defrost() + self.config.EVAL_CKPT_PATH_DIR = models_paths[-1] + self.config.freeze() + + if os.path.isfile(self.config.EVAL_CKPT_PATH_DIR): + # evaluate single checkpoint + result = self._eval_checkpoint(self.config.EVAL_CKPT_PATH_DIR, writer) + return result + else: + # evaluate multiple checkpoints in order + while True: + current_ckpt = None + while current_ckpt is None: + current_ckpt = poll_checkpoint_folder( + self.config.EVAL_CKPT_PATH_DIR, prev_ckpt_ind, eval_interval + ) + time.sleep(2) # sleep for 2 secs before polling again + logger.info(f"=======current_ckpt: {current_ckpt}=======") + prev_ckpt_ind += eval_interval + self._eval_checkpoint( + checkpoint_path=current_ckpt, + writer=writer, + checkpoint_index=prev_ckpt_ind + ) + + def _setup_eval_config(self, checkpoint_config: Config) -> Config: + r"""Sets up and returns a merged config for evaluation. Config + object saved from checkpoint is merged into config file specified + at evaluation time with the following overwrite priority: + eval_opts > ckpt_opts > eval_cfg > ckpt_cfg + If the saved config is outdated, only the eval config is returned. + + Args: + checkpoint_config: saved config from checkpoint. + + Returns: + Config: merged config for eval. + """ + + config = self.config.clone() + + ckpt_cmd_opts = checkpoint_config.CMD_TRAILING_OPTS + eval_cmd_opts = config.CMD_TRAILING_OPTS + + try: + config.merge_from_other_cfg(checkpoint_config) + config.merge_from_other_cfg(self.config) + config.merge_from_list(ckpt_cmd_opts) + config.merge_from_list(eval_cmd_opts) + except KeyError: + logger.info("Saved config is outdated, using solely eval config") + config = self.config.clone() + config.merge_from_list(eval_cmd_opts) + + config.TASK_CONFIG.SIMULATOR.AGENT_0.defrost() + config.TASK_CONFIG.SIMULATOR.AGENT_0.SENSORS = self.config.SENSORS + config.freeze() + + return config + + def _eval_checkpoint( + self, + checkpoint_path: str, + writer: TensorboardWriter, + checkpoint_index: int = 0, + ) -> None: + r"""Evaluates a single checkpoint. Trainer algorithms should + implement this. 
+ + Args: + checkpoint_path: path of checkpoint + writer: tensorboard writer object for logging to tensorboard + checkpoint_index: index of cur checkpoint for logging + + Returns: + None + """ + raise NotImplementedError + + def save_checkpoint(self, file_name) -> None: + raise NotImplementedError + + def load_checkpoint(self, checkpoint_path, *args, **kwargs) -> Dict: + raise NotImplementedError + + @staticmethod + def _pause_envs( + envs_to_pause, + envs, + test_recurrent_hidden_states, + not_done_masks, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + text_frames, + track_query, + track_query_count, + track_perf, + track_agent_step, + test_em_goal=None, + test_em_option=None, + test_em_vln=None, + test_em_vln_dialog=None, + descriptor_pred_gt=None, + ): + + track_query_new = track_query + track_query_count_new = track_query_count + track_perf_new = track_perf + track_agent_step_new = track_agent_step + + # pausing self.envs with no new episode + if len(envs_to_pause) > 0: + state_index = list(range(envs.num_envs)) + for idx in reversed(envs_to_pause): + state_index.pop(idx) + envs.pause_at(idx) + if test_em_goal is not None: + test_em_goal.pop_at(idx) + if test_em_option is not None: + test_em_option.pop_at(idx) + if test_em_vln is not None: + test_em_vln.pop_at(idx) + if test_em_vln_dialog is not None: + test_em_vln_dialog.pop_at(idx) + if descriptor_pred_gt is not None: + descriptor_pred_gt.pop(idx) + + # indexing along the batch dimensions + test_recurrent_hidden_states = test_recurrent_hidden_states[ + :, state_index + ] + not_done_masks = not_done_masks[state_index] + current_episode_reward = current_episode_reward[state_index] + prev_actions = prev_actions[state_index] + + + track_query_new = [] + track_query_count_new = [] + track_perf_new = [] + track_agent_step_new = [] + for idx in state_index: + track_query_new.append(track_query[idx]) + track_query_count_new.append(track_query_count[idx]) + track_perf_new.append(track_perf[idx]) + track_agent_step_new.append(track_agent_step[idx]) + + for k, v in batch.items(): + batch[k] = v[state_index] + + rgb_frames = [rgb_frames[i] for i in state_index] + text_frames = [text_frames[i] for i in state_index] + + if test_em_option is None: + return ( + envs, + test_recurrent_hidden_states, + not_done_masks, + test_em_vln, + test_em_vln_dialog, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + text_frames, + track_query_new, + track_query_count_new, + track_perf_new, + track_agent_step_new + ) + else: + return ( + envs, + test_recurrent_hidden_states, + not_done_masks, + test_em_goal, + test_em_option, + test_em_vln, + test_em_vln_dialog, + current_episode_reward, + prev_actions, + batch, + rgb_frames, + text_frames, + track_query_new, + track_query_count_new, + track_perf_new, + track_agent_step_new, + ) diff --git a/ss_baselines/common/baseline_registry.py b/ss_baselines/common/baseline_registry.py new file mode 100644 index 0000000..c9663b5 --- /dev/null +++ b/ss_baselines/common/baseline_registry.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +r"""BaselineRegistry is extended from habitat.Registry to provide +registration for trainer and environments, while keeping Registry +in habitat core intact. 
+
+Import the baseline registry object using
+
+``from ss_baselines.common.baseline_registry import baseline_registry``
+
+Various decorators for registering different kinds of classes with unique keys
+
+- Register an environment: ``@registry.register_env``
+- Register a trainer: ``@registry.register_trainer``
+"""
+
+from typing import Optional
+
+from habitat.core.registry import Registry
+
+
+class BaselineRegistry(Registry):
+    @classmethod
+    def register_trainer(cls, to_register=None, *, name: Optional[str] = None):
+        r"""Register an RL training algorithm to registry with key 'name'.
+
+        Args:
+            name: Key with which the trainer will be registered.
+                If None will use the name of the class.
+
+        """
+        from ss_baselines.common.base_trainer import BaseTrainer
+
+        return cls._register_impl(
+            "trainer", to_register, name, assert_type=BaseTrainer
+        )
+
+    @classmethod
+    def get_trainer(cls, name):
+        return cls._get_impl("trainer", name)
+
+    @classmethod
+    def register_env(cls, to_register=None, *, name: Optional[str] = None):
+        r"""Register an environment to registry with key 'name';
+        currently only supports subclasses of RLEnv.
+
+        Args:
+            name: Key with which the env will be registered.
+                If None will use the name of the class.
+
+        """
+
+        return cls._register_impl("env", to_register, name)
+
+    @classmethod
+    def get_env(cls, name):
+        return cls._get_impl("env", name)
+
+
+baseline_registry = BaselineRegistry()
diff --git a/ss_baselines/common/benchmark.py b/ss_baselines/common/benchmark.py
new file mode 100644
index 0000000..0da3171
--- /dev/null
+++ b/ss_baselines/common/benchmark.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# SPDX-License-Identifier: CC-BY-4.0
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+r"""Implements evaluation of ``habitat.Agent`` inside ``habitat.Env``.
+``habitat.Benchmark`` creates a ``habitat.Env`` which is specified through
+the ``config_env`` parameter in the constructor. The evaluation is task
+agnostic and is implemented through metrics defined for
+``habitat.EmbodiedTask``.
+"""
+
+from collections import defaultdict
+from typing import Dict, Optional
+import logging
+
+from tqdm import tqdm
+
+from habitat import Config
+from habitat.core.agent import Agent
+# from habitat.core.env import Env
+from ss_baselines.common.environments import AudioNavRLEnv
+from habitat.datasets import make_dataset
+
+
+class Benchmark:
+    r"""Benchmark for evaluating agents in environments.
+    """
+
+    def __init__(self, task_config: Optional[Config] = None) -> None:
+        r"""..
+
+        :param task_config: config to be used for creating the environment
+        """
+        dummy_config = Config()
+        dummy_config.RL = Config()
+        dummy_config.RL.SLACK_REWARD = -0.01
+        dummy_config.RL.SUCCESS_REWARD = 10
+        dummy_config.RL.WITH_TIME_PENALTY = True
+        dummy_config.RL.DISTANCE_REWARD_SCALE = 1
+        dummy_config.RL.WITH_DISTANCE_REWARD = True
+        dummy_config.RL.defrost()
+        dummy_config.TASK_CONFIG = task_config
+        dummy_config.freeze()
+
+        dataset = make_dataset(id_dataset=task_config.DATASET.TYPE, config=task_config.DATASET)
+        self._env = AudioNavRLEnv(config=dummy_config, dataset=dataset)
+
+    def evaluate(
+        self, agent: Agent, num_episodes: Optional[int] = None
+    ) -> Dict[str, float]:
+        r"""..
+
+        :param agent: agent to be evaluated in environment.
+        :param num_episodes: number of episodes for which the
+            evaluation should be run.
+ :return: dict containing metrics tracked by environment. + """ + + if num_episodes is None: + num_episodes = len(self._env.episodes) + else: + assert num_episodes <= len(self._env.episodes), ( + "num_episodes({}) is larger than number of episodes " + "in environment ({})".format( + num_episodes, len(self._env.episodes) + ) + ) + + assert num_episodes > 0, "num_episodes should be greater than 0" + + agg_metrics: Dict = defaultdict(float) + + count_episodes = 0 + reward_episodes = 0 + step_episodes = 0 + success_count = 0 + for count_episodes in tqdm(range(num_episodes)): + agent.reset() + observations = self._env.reset() + episode_reward = 0 + + while not self._env.habitat_env.episode_over: + action = agent.act(observations) + observations, reward, done, info = self._env.step(**action) + logging.debug("Reward: {}".format(reward)) + if done: + logging.debug('Episode reward: {}'.format(episode_reward)) + episode_reward += reward + step_episodes += 1 + + metrics = self._env.habitat_env.get_metrics() + for m, v in metrics.items(): + agg_metrics[m] += v + reward_episodes += episode_reward + success_count += metrics['spl'] > 0 + + avg_metrics = {k: v / count_episodes for k, v in agg_metrics.items()} + logging.info("Average reward: {} in {} episodes".format(reward_episodes / count_episodes, count_episodes)) + logging.info("Average episode steps: {}".format(step_episodes / count_episodes)) + logging.info('Success rate: {}'.format(success_count / num_episodes)) + + return avg_metrics diff --git a/ss_baselines/common/env_utils.py b/ss_baselines/common/env_utils.py new file mode 100644 index 0000000..f2f492a --- /dev/null +++ b/ss_baselines/common/env_utils.py @@ -0,0 +1,137 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Type, Union +import logging +import copy +import random +import sys + +import numpy as np +import torch + +import habitat +from habitat import Config, Env, RLEnv, VectorEnv +from habitat.datasets import make_dataset +from ss_baselines.common.sync_vector_env import SyncVectorEnv + +REPLICA_SCENES = ['apartment_0', 'apartment_1', 'apartment_2', 'frl_apartment_0', 'frl_apartment_1', 'frl_apartment_2', + 'frl_apartment_3', 'frl_apartment_4', 'frl_apartment_5', 'office_0', 'office_1', 'office_2', + 'office_3', 'office_4', 'hotel_0', 'room_0', 'room_1', 'room_2'] + + +def construct_envs( + config: Config, env_class: Type[Union[Env, RLEnv]], auto_reset_done=True +) -> VectorEnv: + r"""Create VectorEnv object with specified config and env class type. + To allow better performance, dataset are split into small ones for + each individual env, grouped by scenes. + + Args: + config: configs that contain num_processes as well as information + necessary to create individual environments. + env_class: class type of the envs to be created + auto_reset_done: automatically reset environments when done + Returns: + VectorEnv object created according to specification. 
+ """ + + num_processes = config.NUM_PROCESSES + configs = [] + env_classes = [env_class for _ in range(num_processes)] + + dataset = make_dataset(config.TASK_CONFIG.DATASET.TYPE) + scenes = dataset.get_scenes_to_load(config.TASK_CONFIG.DATASET) + + # rearrange scenes in the order of scene size since there is a severe imbalance of data size + if "replica" in config.TASK_CONFIG.DATASET.SCENES_DIR: + scenes_new = list() + for scene in REPLICA_SCENES: + if scene in scenes: + scenes_new.append(scene) + scenes = scenes_new + + if len(scenes) > 0: + # random.shuffle(scenes) + assert len(scenes) >= num_processes, ( + "reduce the number of processes as there " + "aren't enough number of scenes" + ) + + scene_splits = [[] for _ in range(num_processes)] + for idx, scene in enumerate(scenes): + scene_splits[idx % len(scene_splits)].append(scene) + + assert sum(map(len, scene_splits)) == len(scenes) + + for i in range(num_processes): + task_config = config.TASK_CONFIG.clone() + task_config.defrost() + if len(scenes) > 0: + task_config.DATASET.CONTENT_SCENES = scene_splits[i] + logging.debug('All scenes: {}'.format(','.join(scene_splits[i]))) + + # overwrite the task config with top-level config file + task_config.SIMULATOR.HABITAT_SIM_V0.GPU_DEVICE_ID = ( + config.SIMULATOR_GPU_ID + ) + task_config.SIMULATOR.AGENT_0.SENSORS = config.SENSORS + task_config.freeze() + + config.defrost() + config.TASK_CONFIG = task_config + config.freeze() + configs.append(config.clone()) + + # use VectorEnv for the best performance and ThreadedVectorEnv for debugging + if config.USE_SYNC_VECENV: + env_launcher = SyncVectorEnv + logging.info('Using SyncVectorEnv') + elif config.USE_VECENV: + env_launcher = habitat.VectorEnv + logging.info('Using VectorEnv') + else: + env_launcher = habitat.ThreadedVectorEnv + logging.info('Using ThreadedVectorEnv') + + envs = env_launcher( + make_env_fn=make_env_fn, + env_fn_args=tuple( + tuple(zip(configs, env_classes, range(num_processes)))), + auto_reset_done=auto_reset_done + ) + return envs + + +def make_env_fn( + config: Config, env_class: Type[Union[Env, RLEnv]], rank: int +) -> Union[Env, RLEnv]: + r"""Creates an env of type env_class with specified config and rank. + This is to be passed in as an argument when creating VectorEnv. + Args: + config: root exp config that has core env config node as well as + env-specific config node. + env_class: class type of the env to be created. + rank: rank of env to be created (for seeding). + Returns: + env object created according to specification. + """ + if not config.USE_SYNC_VECENV: + level = logging.DEBUG if config.DEBUG else logging.INFO + logging.basicConfig(level=level, format='%(asctime)s, %(levelname)s: %(message)s', + datefmt="%Y-%m-%d %H:%M:%S") + random.seed(rank) + np.random.seed(rank) + torch.manual_seed(rank) + + dataset = make_dataset( + config.TASK_CONFIG.DATASET.TYPE, config=config.TASK_CONFIG.DATASET + ) + env = env_class(config=config, dataset=dataset) + env.seed(rank) + return env diff --git a/ss_baselines/common/environments.py b/ss_baselines/common/environments.py new file mode 100644 index 0000000..60639f6 --- /dev/null +++ b/ss_baselines/common/environments.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. 
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# SPDX-License-Identifier: CC-BY-4.0
+
+r"""
+This file hosts task-specific or trainer-specific environments for trainers.
+All environments here should be a (direct or indirect) subclass of the Env
+class in habitat. Customized environments should be registered using
+``@baseline_registry.register_env(name="myEnv")`` for reusability.
+"""
+
+from typing import Optional, Type
+import logging
+import sys
+import numpy as np
+import math
+
+import habitat
+from habitat import Config, Dataset
+from ss_baselines.common.baseline_registry import baseline_registry
+
+
+def get_env_class(env_name: str) -> Type[habitat.RLEnv]:
+    r"""Return environment class based on name.
+    Args:
+        env_name: name of the environment.
+    Returns:
+        Type[habitat.RLEnv]: env class.
+    """
+    return baseline_registry.get_env(env_name)
+
+
+@baseline_registry.register_env(name="AudioNavRLEnv")
+class AudioNavRLEnv(habitat.RLEnv):
+    def __init__(self, config: Config, dataset: Optional[Dataset] = None):
+        self._rl_config = config.RL
+        self._core_env_config = config.TASK_CONFIG
+
+        self._previous_target_distance = None
+        self._previous_action = None
+        self._new_episode = True
+        self._episode_distance_covered = None
+        self._success_distance = self._core_env_config.TASK.SUCCESS_DISTANCE
+        # ------
+        self.query_num = None
+        self.env_idx = None
+        self.is_queried = None
+        super().__init__(self._core_env_config, dataset)
+
+    def reset(self):
+        self._previous_action = None
+        self._new_episode = True
+        # ------
+        self.query_num = 0
+        self.is_queried = False
+        self.env_idx = None
+        observations = super().reset()
+        logging.debug(super().current_episode)
+
+        self._previous_target_distance = self.habitat_env.current_episode.info[
+            "geodesic_distance"
+        ]
+        return observations
+
+    # ------
+
+    def set_query_num(self, query_num):
+        self.query_num = query_num
+
+    def set_idx(self, env_idx):
+        self.env_idx = env_idx
+
+    def set_is_queried(self, is_queried):
+        self.is_queried = is_queried
+
+    def set_constraint_reward(self, cons_reward):
+        self.cons_reward = cons_reward
+
+    def compute_oracle_actions(self):
+        return self._env.sim.compute_oracle_actions()
+
+    def step(self, *args, **kwargs):
+        self._new_episode = False
+        self._previous_action = kwargs["action"]
+        return super().step(*args, **kwargs)
+
+    def get_reward_range(self):
+        return (
+            self._rl_config.SLACK_REWARD - 1.0,
+            self._rl_config.SUCCESS_REWARD + 1.0,
+        )
+
+    def get_reward(self, observations):
+        reward = 0
+        # computed up front: also needed by the distance-constraint branch below
+        current_target_distance = self._distance_target()
+
+        if self._rl_config.WITH_TIME_PENALTY:
+            reward += self._rl_config.SLACK_REWARD
+
+        if self._rl_config.WITH_DISTANCE_REWARD:
+            # if current_target_distance < self._previous_target_distance:
+            reward += (self._previous_target_distance - current_target_distance) * self._rl_config.DISTANCE_REWARD_SCALE
+            self._previous_target_distance = current_target_distance
+
+        if self._episode_success():
+            reward += self._rl_config.SUCCESS_REWARD
+            logging.debug('Reaching goal!')
+
+        if self._rl_config.WITH_QUERY_CONSTRAINT and self.is_queried:
+            if self.query_num <= self._rl_config.NUM_TOTAL_QUERY:
+                if self._rl_config.SOFT_QUERY_REWARD:
+                    # reward += (self.query_num/self._rl_config.NUM_TOTAL_QUERY)*(max(self._rl_config.QUERY_REWARD/2, self._rl_config.SOFT_QUERY_REWARD_MAX))
+                    # taking max as negative value
+                    reward += (self.query_num / self._rl_config.NUM_TOTAL_QUERY) * (math.exp(-self._rl_config.NUM_TOTAL_QUERY) + self._rl_config.QUERY_REWARD)
+                else:
+                    reward += 
math.exp(-self.query_num)+self._rl_config.QUERY_REWARD + + if self._rl_config.CONSECUTIVE_CONSTRAINT_REWARD: + reward += self.cons_reward + + if self._rl_config.WITH_DISTANCE_CONSTRAINT and self.is_queried: + if self._rl_config.DISTANCE_DISTRIBUTION_TYPE=='gaussian': + samp_val = np.random.normal(self._rl_config.MEAN,self._rl_config.SD, 1)[0] + if self._rl_config.DISTANCE_DISTRIBUTION_TYPE=='beta': + samp_val = np.random.beta(self._rl_config.ALPHA, self._rl_config.BETA, 1)[0] + if current_target_distance*samp_val <=3: + reward += self._rl_config.QUERY_REWARD_DISTANCE + + + return reward + + # ------------------------------ + def agent_state(self): + position = self._env.sim.get_agent_state().position.tolist() + rotation = self._env.sim.get_agent_state().rotation.tolist() + receiver_node = self._env.sim._receiver_position_index + source_node = self._env.sim._source_position_index + scene = self._env.sim._current_scene.split('/')[-2] + view = self._env.sim._node2view[scene][str(receiver_node)] + # dialog pretraining + sub_instr = self._env.sim._sub_instr + current_target_distance = self._distance_target() + + appro_next_points = [] + if receiver_node in self._env.sim.paths.keys(): + if source_node in self._env.sim.paths[receiver_node].keys(): + gt_next_points = self._env.sim.paths[receiver_node][source_node][:4] + + for point in gt_next_points: + appro_next_points.append(self._env.sim._node2view[scene][str(point)]) + + return position, rotation, scene, receiver_node, view, appro_next_points, sub_instr, current_target_distance + + def _distance_target(self): + current_position = self._env.sim.get_agent_state().position.tolist() + target_positions = [goal.position for goal in self._env.current_episode.goals] + distance = self._env.sim.geodesic_distance( + current_position, target_positions + ) + return distance + + def _episode_success(self): + if ( + self._env.task.is_stop_called + # and self._distance_target() < self._success_distance + and self._env.sim.reaching_goal + ): + return True + return False + + def get_done(self, observations): + done = False + if self._env.episode_over or self._episode_success(): + done = True + return done + + def get_info(self, observations): + return self.habitat_env.get_metrics() + + # for data collection + def get_current_episode_id(self): + return self.habitat_env.current_episode.episode_id diff --git a/ss_baselines/common/rollout_storage.py b/ss_baselines/common/rollout_storage.py new file mode 100644 index 0000000..d374fe2 --- /dev/null +++ b/ss_baselines/common/rollout_storage.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from collections import defaultdict + +import torch + + +class RolloutStorage: + r"""Class for storing rollout information for RL trainers. 
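+    Observations, recurrent hidden states, previous actions and masks are
+    allocated with ``num_steps + 1`` entries so that index 0 can carry the
+    state left over from the previous rollout (see ``after_update``), while
+    rewards, actions and action log-probs hold ``num_steps`` entries.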
+ + """ + + def __init__( + self, + num_steps, + num_envs, + observation_space, + action_space, + recurrent_hidden_state_size, + num_recurrent_layers=1, + ): + self.observations = {} + + for sensor in observation_space.spaces: + self.observations[sensor] = torch.zeros( + num_steps + 1, + num_envs, + *observation_space.spaces[sensor].shape + ) + + self.recurrent_hidden_states = torch.zeros( + num_steps + 1, + num_recurrent_layers, + num_envs, + recurrent_hidden_state_size, + ) + + self.rewards = torch.zeros(num_steps, num_envs, 1) + self.value_preds = torch.zeros(num_steps + 1, num_envs, 1) + self.returns = torch.zeros(num_steps + 1, num_envs, 1) + + self.action_log_probs = torch.zeros(num_steps, num_envs, 1) + if action_space.__class__.__name__ == "ActionSpace": + action_shape = 1 + else: + action_shape = action_space.shape[0] + + self.actions = torch.zeros(num_steps, num_envs, action_shape) + self.prev_actions = torch.zeros(num_steps + 1, num_envs, action_shape) + if action_space.__class__.__name__ == "ActionSpace": + self.actions = self.actions.long() + self.prev_actions = self.prev_actions.long() + + self.masks = torch.ones(num_steps + 1, num_envs, 1) + + self.num_steps = num_steps + self.step = 0 + + def to(self, device): + for sensor in self.observations: + self.observations[sensor] = self.observations[sensor].to(device) + + self.recurrent_hidden_states = self.recurrent_hidden_states.to(device) + self.rewards = self.rewards.to(device) + self.value_preds = self.value_preds.to(device) + self.returns = self.returns.to(device) + self.action_log_probs = self.action_log_probs.to(device) + self.actions = self.actions.to(device) + self.prev_actions = self.prev_actions.to(device) + self.masks = self.masks.to(device) + + def insert( + self, + observations, + recurrent_hidden_states, + actions, + action_log_probs, + value_preds, + rewards, + masks, + ): + for sensor in observations: + self.observations[sensor][self.step + 1].copy_( + observations[sensor] + ) + self.recurrent_hidden_states[self.step + 1].copy_( + recurrent_hidden_states + ) + self.actions[self.step].copy_(actions) + self.prev_actions[self.step + 1].copy_(actions) + self.action_log_probs[self.step].copy_(action_log_probs) + self.value_preds[self.step].copy_(value_preds) + self.rewards[self.step].copy_(rewards) + self.masks[self.step + 1].copy_(masks) + + self.step = (self.step + 1) % self.num_steps + + def after_update(self): + for sensor in self.observations: + self.observations[sensor][0].copy_(self.observations[sensor][-1]) + + self.recurrent_hidden_states[0].copy_(self.recurrent_hidden_states[-1]) + self.masks[0].copy_(self.masks[-1]) + self.prev_actions[0].copy_(self.prev_actions[-1]) + + def compute_returns(self, next_value, use_gae, gamma, tau): + if use_gae: + self.value_preds[-1] = next_value + gae = 0 + for step in reversed(range(self.rewards.size(0))): + delta = ( + self.rewards[step] + + gamma * self.value_preds[step + 1] * self.masks[step + 1] + - self.value_preds[step] + ) + gae = delta + gamma * tau * self.masks[step + 1] * gae + self.returns[step] = gae + self.value_preds[step] + else: + self.returns[-1] = next_value + for step in reversed(range(self.rewards.size(0))): + self.returns[step] = ( + self.returns[step + 1] * gamma * self.masks[step + 1] + + self.rewards[step] + ) + + def recurrent_generator(self, advantages, num_mini_batch): + num_processes = self.rewards.size(1) + assert num_processes >= num_mini_batch, ( + "Trainer requires the number of processes ({}) " + "to be greater than or equal to the number of 
" + "trainer mini batches ({}).".format(num_processes, num_mini_batch) + ) + num_envs_per_batch = num_processes // num_mini_batch + perm = torch.randperm(num_processes) + for start_ind in range(0, num_processes, num_envs_per_batch): + observations_batch = defaultdict(list) + + recurrent_hidden_states_batch = [] + actions_batch = [] + prev_actions_batch = [] + value_preds_batch = [] + return_batch = [] + masks_batch = [] + old_action_log_probs_batch = [] + adv_targ = [] + + for offset in range(num_envs_per_batch): + ind = perm[start_ind + offset] + + for sensor in self.observations: + observations_batch[sensor].append( + self.observations[sensor][:-1, ind] + ) + + recurrent_hidden_states_batch.append( + self.recurrent_hidden_states[0, :, ind] + ) + + actions_batch.append(self.actions[:, ind]) + prev_actions_batch.append(self.prev_actions[:-1, ind]) + value_preds_batch.append(self.value_preds[:-1, ind]) + return_batch.append(self.returns[:-1, ind]) + masks_batch.append(self.masks[:-1, ind]) + old_action_log_probs_batch.append( + self.action_log_probs[:, ind] + ) + + adv_targ.append(advantages[:, ind]) + + T, N = self.num_steps, num_envs_per_batch + + # These are all tensors of size (T, N, -1) + for sensor in observations_batch: + observations_batch[sensor] = torch.stack( + observations_batch[sensor], 1 + ) + + actions_batch = torch.stack(actions_batch, 1) + prev_actions_batch = torch.stack(prev_actions_batch, 1) + value_preds_batch = torch.stack(value_preds_batch, 1) + return_batch = torch.stack(return_batch, 1) + masks_batch = torch.stack(masks_batch, 1) + old_action_log_probs_batch = torch.stack( + old_action_log_probs_batch, 1 + ) + adv_targ = torch.stack(adv_targ, 1) + + # States is just a (num_recurrent_layers, N, -1) tensor + recurrent_hidden_states_batch = torch.stack( + recurrent_hidden_states_batch, 1 + ) + + # Flatten the (T, N, ...) tensors to (T * N, ...) + for sensor in observations_batch: + observations_batch[sensor] = self._flatten_helper( + T, N, observations_batch[sensor] + ) + + actions_batch = self._flatten_helper(T, N, actions_batch) + prev_actions_batch = self._flatten_helper(T, N, prev_actions_batch) + value_preds_batch = self._flatten_helper(T, N, value_preds_batch) + return_batch = self._flatten_helper(T, N, return_batch) + masks_batch = self._flatten_helper(T, N, masks_batch) + old_action_log_probs_batch = self._flatten_helper( + T, N, old_action_log_probs_batch + ) + adv_targ = self._flatten_helper(T, N, adv_targ) + + yield ( + observations_batch, + recurrent_hidden_states_batch, + actions_batch, + prev_actions_batch, + value_preds_batch, + return_batch, + masks_batch, + old_action_log_probs_batch, + adv_targ, + ) + + @staticmethod + def _flatten_helper(t: int, n: int, tensor: torch.Tensor) -> torch.Tensor: + r"""Given a tensor of size (t, n, ..), flatten it to size (t*n, ...). + + Args: + t: first dimension of tensor. + n: second dimension of tensor. + tensor: target tensor to be flattened. + + Returns: + flattened tensor of size (t*n, ...) + """ + return tensor.view(t * n, *tensor.size()[2:]) diff --git a/ss_baselines/common/simple_agents.py b/ss_baselines/common/simple_agents.py new file mode 100644 index 0000000..8b2bd9e --- /dev/null +++ b/ss_baselines/common/simple_agents.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+
+
+import argparse
+from math import pi
+import logging
+
+import numpy as np
+
+import habitat
+from habitat.sims.habitat_simulator.actions import HabitatSimActions
+# from habitat.config.default import get_config
+from av_wan.common.benchmark import Benchmark
+from av_wan.config.default import get_task_config as get_config
+
+
+class RandomAgent(habitat.Agent):
+    def __init__(self, success_distance, goal_sensor_uuid):
+        self.dist_threshold_to_stop = success_distance
+        self.goal_sensor_uuid = goal_sensor_uuid
+
+    def reset(self):
+        pass
+
+    def is_goal_reached(self, observations):
+        # the goal sensor reading is in polar coordinates; index 0 is the distance
+        dist = observations[self.goal_sensor_uuid][0]
+        return dist <= self.dist_threshold_to_stop
+
+    def act(self, observations):
+        if self.is_goal_reached(observations):
+            action = HabitatSimActions.STOP
+        else:
+            action = np.random.choice(
+                [
+                    HabitatSimActions.MOVE_FORWARD,
+                    HabitatSimActions.TURN_LEFT,
+                    HabitatSimActions.TURN_RIGHT,
+                ]
+            )
+        return {"action": action}
+
+
+class ForwardOnlyAgent(RandomAgent):
+    def act(self, observations):
+        if self.is_goal_reached(observations):
+            action = HabitatSimActions.STOP
+        else:
+            action = HabitatSimActions.MOVE_FORWARD
+        return {"action": action}
+
+
+class RandomForwardAgent(RandomAgent):
+    def __init__(self, success_distance, goal_sensor_uuid):
+        super().__init__(success_distance, goal_sensor_uuid)
+        self.FORWARD_PROBABILITY = 0.8
+
+    def act(self, observations):
+        if self.is_goal_reached(observations):
+            action = HabitatSimActions.STOP
+        else:
+            if np.random.uniform(0, 1) < self.FORWARD_PROBABILITY:
+                action = HabitatSimActions.MOVE_FORWARD
+            else:
+                action = np.random.choice(
+                    [HabitatSimActions.TURN_LEFT, HabitatSimActions.TURN_RIGHT]
+                )
+
+        return {"action": action}
+
+
+class GoalFollower(RandomAgent):
+    def __init__(self, success_distance, goal_sensor_uuid):
+        super().__init__(success_distance, goal_sensor_uuid)
+        self.pos_th = self.dist_threshold_to_stop
+        self.angle_th = float(np.deg2rad(15))
+        self.random_prob = 0
+
+    def normalize_angle(self, angle):
+        if angle < -pi:
+            angle = 2.0 * pi + angle
+        if angle > pi:
+            angle = -2.0 * pi + angle
+        return angle
+
+    def turn_towards_goal(self, angle_to_goal):
+        if angle_to_goal > pi or (
+            (angle_to_goal < 0) and (angle_to_goal > -pi)
+        ):
+            action = HabitatSimActions.TURN_RIGHT
+        else:
+            action = HabitatSimActions.TURN_LEFT
+        return action
+
+    def act(self, observations):
+        if self.is_goal_reached(observations):
+            action = HabitatSimActions.STOP
+        else:
+            angle_to_goal = self.normalize_angle(
+                np.array(observations[self.goal_sensor_uuid][1])
+            )
+            if abs(angle_to_goal) < self.angle_th:
+                action = HabitatSimActions.MOVE_FORWARD
+            else:
+                action = self.turn_towards_goal(angle_to_goal)
+
+        return {"action": action}
+
+
+def get_all_subclasses(cls):
+    return set(cls.__subclasses__()).union(
+        [s for c in cls.__subclasses__() for s in get_all_subclasses(c)]
+    )
+
+
+def get_agent_cls(agent_class_name):
+    sub_classes = [
+        sub_class
+        for sub_class in get_all_subclasses(habitat.Agent)
+        if sub_class.__name__ == agent_class_name
+    ]
+    return sub_classes[0]
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--success-distance", type=float, default=0.2)
+    parser.add_argument(
+        "--task-config", type=str, default="configs/tasks/pointnav.yaml"
+    )
+    parser.add_argument("--agent-class", type=str, default="RandomAgent")
+    parser.add_argument("--debug", default=False, action="store_true")
+    args = parser.parse_args()
+
+    level = 
logging.DEBUG if args.debug else logging.INFO + logging.basicConfig(level=level, format='%(asctime)s, %(levelname)s: %(message)s', + datefmt="%Y-%m-%d %H:%M:%S") + + task_config = get_config(args.task_config) + task_config.defrost() + task_config.DATASET.SPLIT = 'test_telephone' + task_config.freeze() + + agent = get_agent_cls(args.agent_class)( + success_distance=args.success_distance, + goal_sensor_uuid=task_config.TASK.GOAL_SENSOR_UUID, + ) + benchmark = Benchmark(task_config) + metrics = benchmark.evaluate(agent) + + for k, v in metrics.items(): + habitat.logger.info("{}: {:.3f}".format(k, v)) + + +if __name__ == "__main__": + main() diff --git a/ss_baselines/common/sync_vector_env.py b/ss_baselines/common/sync_vector_env.py new file mode 100644 index 0000000..fac7e77 --- /dev/null +++ b/ss_baselines/common/sync_vector_env.py @@ -0,0 +1,586 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0-or-later +# SPDX-License-Identifier: CC-BY-4.0 + +from multiprocessing.connection import Connection +from multiprocessing.context import BaseContext +from queue import Queue +from threading import Thread +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Set, + Tuple, + Union, +) + +import gym +import numpy as np +from gym.spaces import Dict as SpaceDict + +import habitat +from habitat.config import Config +from habitat.core.env import Env, Observations, RLEnv +from habitat.core.logging import logger +from habitat.core.utils import tile_images + +try: + # Use torch.multiprocessing if we can. + # We have yet to find a reason to not use it and + # you are required to use it when sending a torch.Tensor + # between processes + import torch.multiprocessing as mp +except ImportError: + import multiprocessing as mp + +STEP_COMMAND = "step" +RESET_COMMAND = "reset" +RENDER_COMMAND = "render" +CLOSE_COMMAND = "close" +OBSERVATION_SPACE_COMMAND = "observation_space" +ACTION_SPACE_COMMAND = "action_space" +CALL_COMMAND = "call" +EPISODE_COMMAND = "current_episode" + +# ---------- +STATE_COMMAND = 'state' +IS_NEW_EPISODE_COMMAND = 'is_new_episode' +ORACLE_ACTION_COMMAND = 'o_action' + + +def _make_env_fn( + config: Config, dataset: Optional[habitat.Dataset] = None, rank: int = 0 +) -> Env: + """Constructor for default habitat `env.Env`. + + :param config: configuration for environment. + :param dataset: dataset for environment. 
+    :param rank: rank for setting seed of environment
+    :return: `env.Env` / `env.RLEnv` object
+    """
+    habitat_env = Env(config=config, dataset=dataset)
+    habitat_env.seed(config.SEED + rank)
+    return habitat_env
+
+
+class WorkerEnv:
+    r"""In-process stand-in for a worker process: owns one environment and
+    answers the same command protocol that `VectorEnv` sends over pipes.
+    """
+
+    def __init__(self, env_fn, env_fn_arg, auto_reset_done):
+        self._env = env_fn(*env_fn_arg)
+        self._auto_reset_done = auto_reset_done
+
+    def __call__(self, command, data):
+        if command == STEP_COMMAND:
+            # different step methods for habitat.RLEnv and habitat.Env
+            if isinstance(self._env, (habitat.RLEnv, gym.Env)):
+                # habitat.RLEnv
+                observations, reward, done, info = self._env.step(**data)
+                if self._auto_reset_done and done:
+                    observations = self._env.reset()
+                return observations, reward, done, info
+            elif isinstance(self._env, habitat.Env):
+                # habitat.Env
+                observations = self._env.step(**data)
+                if self._auto_reset_done and self._env.episode_over:
+                    observations = self._env.reset()
+                return observations
+            else:
+                raise NotImplementedError
+
+        elif command == RESET_COMMAND:
+            return self._env.reset()
+
+        elif command == RENDER_COMMAND:
+            return self._env.render(*data[0], **data[1])
+
+        elif command in (OBSERVATION_SPACE_COMMAND, ACTION_SPACE_COMMAND):
+            return getattr(self._env, command)
+
+        elif command == CALL_COMMAND:
+            function_name, function_args = data
+            if function_args is None or len(function_args) == 0:
+                return getattr(self._env, function_name)()
+            return getattr(self._env, function_name)(**function_args)
+
+        # TODO: update CALL_COMMAND for getting attribute like this
+        elif command == EPISODE_COMMAND:
+            return self._env.current_episode
+
+        # --------------------------
+        elif command == STATE_COMMAND:
+            return self._env.agent_state()
+
+        elif command == IS_NEW_EPISODE_COMMAND:
+            return self._env._new_episode
+
+        elif command == ORACLE_ACTION_COMMAND:
+            return self._env.compute_oracle_actions()
+
+        elif command == CLOSE_COMMAND:
+            # release the simulator held by this environment
+            self._env.close()
+            return None
+
+        else:
+            raise NotImplementedError
+
+
+class SyncVectorEnv:
+    r"""Vectorized environment that keeps every sub-environment in the
+    current process and steps them one after another. It mirrors the
+    `VectorEnv` interface but spawns no worker processes, which makes it
+    the easiest variant to debug.
+
+    All the environments are synchronized on step and reset methods.
+    """
+
+    observation_spaces: List[SpaceDict]
+    action_spaces: List[SpaceDict]
+    _is_waiting: bool
+    _num_envs: int
+    _auto_reset_done: bool
+
+    def __init__(
+        self,
+        make_env_fn: Callable[..., Union[Env, RLEnv]] = _make_env_fn,
+        env_fn_args: Sequence[Tuple] = None,
+        auto_reset_done: bool = True,
+        multiprocessing_start_method: str = "forkserver",
+    ) -> None:
+        """..
+
+        :param make_env_fn: function which creates a single environment. An
+            environment can be of type `env.Env` or `env.RLEnv`
+        :param env_fn_args: tuple of tuple of args to pass to the
+            `_make_env_fn`.
+        :param auto_reset_done: automatically reset the environment when
+            done. This functionality is provided for seamless training
+            of vectorized environments.
+        :param multiprocessing_start_method: accepted only for signature
+            compatibility with `VectorEnv`; no subprocesses are spawned
+            here, so the value is unused.
+        """
+        self._is_waiting = False
+        self._is_closed = True
+
+        assert (
+            env_fn_args is not None and len(env_fn_args) > 0
+        ), "number of environments to be created should be greater than 0"
+
+        self._num_envs = len(env_fn_args)
+        self._auto_reset_done = auto_reset_done
+
+        # construct every environment inline instead of spawning workers
+        self.workers = []
+        for env_fn_arg in env_fn_args:
+            worker = WorkerEnv(make_env_fn, env_fn_arg, auto_reset_done=auto_reset_done)
+            self.workers.append(worker)
+        self._is_closed = False
+
+        self.observation_spaces = [worker(OBSERVATION_SPACE_COMMAND, None) for worker in self.workers]
+        self.action_spaces = [worker(ACTION_SPACE_COMMAND, None) for worker in self.workers]
+        self._paused = []
+
+    @property
+    def num_envs(self):
+        r"""number of individual environments.
+        """
+        return self._num_envs - len(self._paused)
+
+    def agent_state(self):
+        return [worker(STATE_COMMAND, None) for worker in self.workers]
+
+    def is_new_episode(self):
+        return [worker(IS_NEW_EPISODE_COMMAND, None) for worker in self.workers]
+
+    def compute_oracle_actions(self):
+        return [worker(ORACLE_ACTION_COMMAND, None) for worker in self.workers]
+
+    def current_episodes(self):
+        return [worker(EPISODE_COMMAND, None) for worker in self.workers]
+
+    def reset(self):
+        r"""Reset all the vectorized environments
+
+        :return: list of outputs from the reset method of envs.
+        """
+        return [worker(RESET_COMMAND, None) for worker in self.workers]
+
+    def reset_at(self, index_env: int):
+        r"""Reset in the index_env environment in the vector.
+
+        :param index_env: index of the environment to be reset
+        :return: list containing the output of reset method of indexed env.
+        """
+        return [self.workers[index_env](RESET_COMMAND, None)]
+
+    def step_at(self, index_env: int, action: Dict[str, Any]):
+        r"""Step in the index_env environment in the vector.
+
+        :param index_env: index of the environment to be stepped into
+        :param action: action to be taken
+        :return: list containing the output of step method of indexed env.
+        """
+        return [self.workers[index_env](STEP_COMMAND, action)]
+
+    def async_step(self, data: List[Union[int, str, Dict[str, Any]]]) -> None:
+        r"""Record actions to execute on the next `wait_step` call. Stepping
+        is synchronous here, so this only stores the converted arguments.
+
+        :param data: list of size _num_envs containing keyword arguments to
+            pass to `step` method for each Environment. For example,
+            :py:`[{"action": "TURN_LEFT", "action_args": {...}}, ...]`.
+        """
+        # Backward compatibility
+        if isinstance(data[0], (int, np.integer, str)):
+            data = [{"action": {"action": action}} for action in data]
+
+        self._async_step_data = data
+        self._is_waiting = True
+
+    def wait_step(self) -> List[Observations]:
+        r"""Execute the actions recorded by `async_step` and return the
+        results.
+        """
+        results = [
+            worker(STEP_COMMAND, args)
+            for worker, args in zip(self.workers, self._async_step_data)
+        ]
+        self._is_waiting = False
+        return results
+
+    def step(self, data: List[Union[int, str, Dict[str, Any]]]) -> List[Any]:
+        r"""Perform actions in the vectorized environments.
+
+        :param data: list of size _num_envs containing keyword arguments to
+            pass to `step` method for each Environment. For example,
+            :py:`[{"action": "TURN_LEFT", "action_args": {...}}, ...]`.
+        :return: list of outputs from the step method of envs.
+        """
+        if isinstance(data[0], (int, np.integer, str)):
+            data = [{"action": {"action": action}} for action in data]
+        return [worker(STEP_COMMAND, args) for worker, args in zip(self.workers, data)]
+
+    def close(self) -> None:
+        if self._is_closed:
+            return
+
+        for worker in self.workers:
+            worker(CLOSE_COMMAND, None)
+        for _, worker in self._paused:
+            worker(CLOSE_COMMAND, None)
+
+        self._is_closed = True
+
+    def pause_at(self, index: int) -> None:
+        r"""Pauses computation on this env without destroying the env.
+
+        :param index: which env to pause. All indexes after this one will be
+            shifted down by one.
+
+        This is useful for not needing to call steps on all environments when
+        only some are active (for example during the last episodes of running
+        eval episodes).
+        """
+        worker = self.workers.pop(index)
+        self._paused.append((index, worker))
+
+    def resume_all(self) -> None:
+        r"""Resumes any paused envs.
+        """
+        for index, worker in reversed(self._paused):
+            self.workers.insert(index, worker)
+        self._paused = []
+
+    def call_at(
+        self,
+        index: int,
+        function_name: str,
+        function_args: Optional[Dict[str, Any]] = None,
+    ) -> Any:
+        r"""Calls a function (which is passed by name) on the selected env and
+        returns the result.
+
+        :param index: which env to call the function on.
+        :param function_name: the name of the function to call on the env.
+        :param function_args: optional function args.
+        :return: result of calling the function.
+        """
+        return self.workers[index](CALL_COMMAND, (function_name, function_args))
+
+    def call(
+        self,
+        function_names: List[str],
+        function_args_list: Optional[List[Any]] = None,
+    ) -> List[Any]:
+        r"""Calls a list of functions (which are passed by name) on the
+        corresponding env (by index).
+
+        :param function_names: the name of the functions to call on the envs.
+        :param function_args_list: list of function args for each function. If
+            provided, :py:`len(function_args_list)` should be as long as
+            :py:`len(function_names)`.
+        :return: result of calling the function.
+        """
+        if function_args_list is None:
+            function_args_list = [None] * len(function_names)
+        assert len(function_names) == len(function_args_list)
+        func_args = zip(function_names, function_args_list)
+        return [
+            worker(CALL_COMMAND, func_args_on)
+            for worker, func_args_on in zip(self.workers, func_args)
+        ]
+
+    def render(
+        self, mode: str = "human", *args, **kwargs
+    ) -> Union[np.ndarray, None]:
+        r"""Render observations from all environments in a tiled image.
+
+        :param mode: `"human"` shows the tiled frame in an OpenCV window,
+            `"rgb_array"` returns it as a numpy array.
+        """
+        images = [
+            worker(RENDER_COMMAND, (args, {"mode": "rgb", **kwargs}))
+            for worker in self.workers
+        ]
+        tile = tile_images(images)
+        if mode == "human":
+            from habitat.core.utils import try_cv2_import
+
+            cv2 = try_cv2_import()
+
+            cv2.imshow("vecenv", tile[:, :, ::-1])
+            cv2.waitKey(1)
+            return None
+        elif mode == "rgb_array":
+            return tile
+        else:
+            raise NotImplementedError
+
+    @property
+    def _valid_start_methods(self) -> Set[str]:
+        return {"forkserver", "spawn", "fork"}
+
+    def __del__(self):
+        self.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.close()
diff --git a/ss_baselines/common/tensorboard_utils.py b/ss_baselines/common/tensorboard_utils.py
new file mode 100644
index 0000000..4c8a4b0
--- /dev/null
+++ b/ss_baselines/common/tensorboard_utils.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3

+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# SPDX-License-Identifier: CC-BY-4.0

+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any
+
+import numpy as np
+import torch
+from torch.utils.tensorboard import SummaryWriter
+
+
+class TensorboardWriter:
+    def __init__(self, log_dir: str, *args: Any, **kwargs: Any):
+        r"""A Wrapper for tensorboard SummaryWriter. It creates a dummy writer
+        when log_dir is empty string or None. It also has functionality that
+        generates tb video directly from numpy images.
+
+        Args:
+            log_dir: Save directory location. Will not write to disk if
+                log_dir is an empty string.
+            *args: Additional positional args for SummaryWriter
+            **kwargs: Additional keyword args for SummaryWriter
+        """
+        self.writer = None
+        if log_dir is not None and len(log_dir) > 0:
+            self.writer = SummaryWriter(log_dir, *args, **kwargs)
+
+    def __getattr__(self, item):
+        if self.writer:
+            return self.writer.__getattribute__(item)
+        else:
+            return lambda *args, **kwargs: None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.writer:
+            self.writer.close()
+
+    def add_video_from_np_images(
+        self, video_name: str, step_idx: int, images: np.ndarray, fps: int = 10
+    ) -> None:
+        r"""Write video into tensorboard from images frames.
+
+        Args:
+            video_name: name of video string.
+            step_idx: int of checkpoint index to be displayed.
+            images: list of n frames. Each frame is a np.ndarray of shape.
+            fps: frame per second for output video.
+
+        Returns:
+            None.
+        """
+        if not self.writer:
+            return
+        # initial shape of np.ndarray list: N * (H, W, 3)
+        frame_tensors = [
+            torch.from_numpy(np_arr).unsqueeze(0) for np_arr in images
+        ]
+        video_tensor = torch.cat(tuple(frame_tensors))
+        video_tensor = video_tensor.permute(0, 3, 1, 2).unsqueeze(0)
+        # final shape of video tensor: (1, n, 3, H, W)
+        self.writer.add_video(
+            video_name, video_tensor, fps=fps, global_step=step_idx
+        )
diff --git a/ss_baselines/common/utils.py b/ss_baselines/common/utils.py
new file mode 100644
index 0000000..c6a6911
--- /dev/null
+++ b/ss_baselines/common/utils.py
@@ -0,0 +1,727 @@
+#!/usr/bin/env python3

+# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL)
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# SPDX-License-Identifier: CC-BY-4.0
+
+import glob
+import os
+from collections import defaultdict
+from typing import Dict, List, Optional
+import random
+import copy
+import numbers
+import json
+import sys
+
+import numpy as np
+import cv2
+from scipy.io import wavfile
+import torch
+import torch.nn as nn
+import torch.nn.functional as f
+import moviepy.editor as mpy
+from gym.spaces import Box
+from moviepy.audio.AudioClip import CompositeAudioClip
+
+from habitat.utils.visualizations.utils import images_to_video
+from habitat import logger
+from habitat_sim.utils.common import d3_40_colors_rgb
+from ss_baselines.common.tensorboard_utils import TensorboardWriter
+from habitat.utils.visualizations import maps
+from habitat.utils.visualizations.utils import draw_collision
+from habitat_sim.utils.common import quat_to_angle_axis, quat_to_coeffs, quat_from_angle_axis, quat_from_coeffs
+
+
+class Flatten(nn.Module):
+    def forward(self, x):
+        return x.reshape(x.size(0), -1)
+
+
+class CustomFixedCategorical(torch.distributions.Categorical):
+    def sample(self, sample_shape=torch.Size()):
+        return super().sample(sample_shape).unsqueeze(-1)
+
+    def log_probs(self, actions):
+        return (
+            super()
+            .log_prob(actions.squeeze(-1))
+            .view(actions.size(0), -1)
+            .sum(-1)
+            .unsqueeze(-1)
+        )
+
+    def mode(self):
+        return self.probs.argmax(dim=-1, keepdim=True)
+
+
+class CategoricalNet(nn.Module):
+    def __init__(self, num_inputs, num_outputs):
+        super().__init__()
+
+        self.linear = nn.Linear(num_inputs, num_outputs)
+
+        nn.init.orthogonal_(self.linear.weight, gain=0.01)
+        nn.init.constant_(self.linear.bias, 0)
+
+    def forward(self, x):
+        x = self.linear(x)
+        return CustomFixedCategorical(logits=x), x
+
+
+class CategoricalNetWithMask(nn.Module):
+    def __init__(self, num_inputs, num_outputs, masking):
+        super().__init__()
+        self.masking = masking
+
+        self.linear = nn.Linear(num_inputs, num_outputs)
+
+        nn.init.orthogonal_(self.linear.weight, gain=0.01)
+        nn.init.constant_(self.linear.bias, 0)
+
+    def forward(self, features, action_maps):
+        probs = f.softmax(self.linear(features), dim=-1)
+        if self.masking:
+            probs = probs * torch.reshape(action_maps, (action_maps.shape[0], -1)).float()
+
+        return CustomFixedCategorical(probs=probs)
+
+
+def linear_decay(epoch: int, total_num_updates: int) -> float:
+    r"""Returns a multiplicative factor for linear value decay
+
+    Args:
+        epoch: current epoch number
+        total_num_updates: total number of epochs
+
+    Returns:
+        multiplicative factor that decreases param value linearly
+    """
+    return 1 - (epoch / float(total_num_updates))
+
+
+def exponential_decay(epoch: int, total_num_updates: int, decay_lambda: float) -> float:
+    r"""Returns a multiplicative factor for exponential value decay
+
+    Args:
+        epoch: current epoch number
+        total_num_updates: total number of epochs
+        decay_lambda: decay lambda
+
+    Returns:
+        multiplicative factor that decreases param value exponentially
+    """
+    return np.exp(-decay_lambda * (epoch / float(total_num_updates)))
+
+
+def to_tensor(v):
+    if torch.is_tensor(v):
+        return v
+    elif isinstance(v, np.ndarray):
+        return torch.from_numpy(v)
+    else:
+        return torch.tensor(v, dtype=torch.float)
+
+
+def batch_obs(
+    observations: List[Dict], device: Optional[torch.device] = None, skip_list=()
+) -> Dict[str, torch.Tensor]:
+    r"""Transpose a batch of observation dicts to a dict of batched
+    observations.
+
+    Args:
+        observations: list of dicts of observations.
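+        skip_list: optional collection of sensor names to leave out of the
+            batch.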
+ device: The torch.device to put the resulting tensors on. + Will not move the tensors if None + + Returns: + transposed dict of lists of observations. + """ + batch = defaultdict(list) + + for obs in observations: + for sensor in obs: + if sensor in skip_list: + continue + batch[sensor].append(to_tensor(obs[sensor]).float()) + + for sensor in batch: + batch[sensor] = torch.stack(batch[sensor], dim=0).to( + device=device, dtype=torch.float + ) + + return batch + + +def poll_checkpoint_folder( + checkpoint_folder: str, previous_ckpt_ind: int, eval_interval: int +) -> Optional[str]: + r""" Return (previous_ckpt_ind + 1)th checkpoint in checkpoint folder + (sorted by time of last modification). + + Args: + checkpoint_folder: directory to look for checkpoints. + previous_ckpt_ind: index of checkpoint last returned. + eval_interval: number of checkpoints between two evaluation + + Returns: + return checkpoint path if (previous_ckpt_ind + 1)th checkpoint is found + else return None. + """ + assert os.path.isdir(checkpoint_folder), ( + f"invalid checkpoint folder " f"path {checkpoint_folder}" + ) + models_paths = list( + filter(os.path.isfile, glob.glob(checkpoint_folder + "/*")) + ) + models_paths.sort(key=os.path.getmtime) + ind = previous_ckpt_ind + eval_interval + if ind < len(models_paths): + return models_paths[ind] + return None + + +def generate_video( + video_option: List[str], + video_dir: Optional[str], + images: List[np.ndarray], + scene_name: str, + sound: str, + sr: int, + episode_id: int, + checkpoint_idx: int, + metric_name: str, + metric_value: float, + tb_writer: TensorboardWriter, + fps: int = 10, + audios: List[str] = None, + text=None, + num_steps = 50, + qs_method = 'ours' +) -> None: + r"""Generate video according to specified information. + + Args: + video_option: string list of "tensorboard" or "disk" or both. + video_dir: path to target video directory. + images: list of images to be converted to video. + episode_id: episode id for video naming. + checkpoint_idx: checkpoint index for video naming. + metric_name: name of the performance metric, e.g. "spl". + metric_value: value of metric. + tb_writer: tensorboard writer object for uploading video. + fps: fps for generated video. 
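+        scene_name: scene name, used in the video file name.
+        sound: sound identifier, used in the video file name.
+        sr: sample rate of the raw audio clips.
+        text: optional list of per-frame caption strings.
+        qs_method: query-method tag appended to the video file name.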
+ audios: raw audio files + Returns: + None + """ + if len(images) < 1: + return + + video_name = f"{scene_name}_{episode_id}_{sound}_{metric_name}{metric_value:.2f}_{qs_method}" + if "disk" in video_option: + assert video_dir is not None + + if audios is None: + # print('here') + images_to_video(images, video_dir, video_name) + else: + images_to_video_with_audio(images, video_dir, video_name, audios, sr, fps=fps, text=text, num_steps=num_steps ) + if "tensorboard" in video_option: + tb_writer.add_video_from_np_images( + f"episode{episode_id}", checkpoint_idx, images, fps=fps + ) + + +def plot_top_down_map(info, dataset='replica', pred=None): + top_down_map = info["top_down_map"]["map"] + top_down_map = maps.colorize_topdown_map( + top_down_map, info["top_down_map"]["fog_of_war_mask"] + ) + map_agent_pos = info["top_down_map"]["agent_map_coord"] + if dataset == 'replica': + agent_radius_px = top_down_map.shape[0] // 16 + else: + agent_radius_px = top_down_map.shape[0] // 50 + top_down_map = maps.draw_agent( + image=top_down_map, + agent_center_coord=map_agent_pos, + agent_rotation=info["top_down_map"]["agent_angle"], + agent_radius_px=agent_radius_px + ) + pred=None + if pred is not None: + from habitat.utils.geometry_utils import quaternion_rotate_vector + + # source_rotation = info["top_down_map"]["agent_rotation"] + source_rotation = quat_from_angle_axis(np.deg2rad(info["top_down_map"]["agent_angle"]), np.array([0, 1, 0])) + + + rounded_pred = np.round(pred[1]) + direction_vector_agent = np.array([rounded_pred[1], 0, -rounded_pred[0]]) + direction_vector = quaternion_rotate_vector(source_rotation, direction_vector_agent) + + grid_size = ( + (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000, + (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000, + ) + delta_x = int(-direction_vector[0] / grid_size[0]) + delta_y = int(direction_vector[2] / grid_size[1]) + + x = np.clip(map_agent_pos[0] + delta_x, a_min=0, a_max=top_down_map.shape[0]) + y = np.clip(map_agent_pos[1] + delta_y, a_min=0, a_max=top_down_map.shape[1]) + point_padding = 20 + for m in range(x - point_padding, x + point_padding + 1): + for n in range(y - point_padding, y + point_padding + 1): + if np.linalg.norm(np.array([m - x, n - y])) <= point_padding and \ + 0 <= m < top_down_map.shape[0] and 0 <= n < top_down_map.shape[1]: + top_down_map[m, n] = (0, 255, 255) + if np.linalg.norm(rounded_pred) < 1: + assert delta_x == 0 and delta_y == 0 + + if top_down_map.shape[0] > top_down_map.shape[1]: + top_down_map = np.rot90(top_down_map, 1) + return top_down_map + +def images_to_video_with_audio( + images: List[np.ndarray], + output_dir: str, + video_name: str, + audios: List[str], + sr: int, + fps: int = 1, + quality: Optional[float] = 5, + text=None, + num_steps = 50, + **kwargs +): + r"""Calls imageio to run FFMPEG on a list of images. For more info on + parameters, see https://imageio.readthedocs.io/en/stable/format_ffmpeg.html + Args: + images: The list of images. Images should be HxWx3 in RGB order. + output_dir: The folder to put the video in. + video_name: The name for the video. + audios: raw audio files + fps: Frames per second for the video. Not all values work with FFMPEG, + use at your own risk. + quality: Default is 5. Uses variable bit rate. Highest quality is 10, + lowest is 0. Set to None to prevent variable bitrate flags to + FFMPEG so you can manually specify them using output_params + instead. Specifying a fixed bitrate using ‘bitrate’ disables + this parameter. 
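+        text: optional list of per-frame caption strings; when provided it
+            must have the same length as ``images``. Note that ``len(images)``
+            must equal ``len(audios) * fps`` so that each one-second audio
+            clip lines up with one group of frames.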
+ """ + assert 0 <= quality <= 10 + if not os.path.exists(output_dir): + os.makedirs(output_dir) + if not os.path.exists(os.path.join(output_dir,'cv2')): + os.makedirs(os.path.join(output_dir,'cv2')) + + video_name = video_name.replace(" ", "_").replace("\n", "_") + ".mp4" + + if not os.path.exists(os.path.join(output_dir,'cv2', video_name)): + os.makedirs(os.path.join(output_dir,'cv2', video_name)) + + assert len(images) == len(audios) * fps + audio_clips = [] + temp_file_name = '/tmp/{}.wav'.format(random.randint(0, 10000)) + # use amplitude scaling factor to reduce the volume of sounds + amplitude_scaling_factor = 10 #100 + for i, audio in enumerate(audios): + # def f(t): + # return audio[0, t], audio[1: t] + # + # audio_clip = mpy.AudioClip(f, duration=1, fps=audio.shape[1]) + wavfile.write(temp_file_name, sr, audio.T / amplitude_scaling_factor) + audio_clip = mpy.AudioFileClip(temp_file_name) + audio_clip = audio_clip.set_duration(1) + audio_clip = audio_clip.set_start(i) + audio_clips.append(audio_clip) + composite_audio_clip = CompositeAudioClip(audio_clips) + video_clip = mpy.ImageSequenceClip(images, fps=fps) + + + ''' + # adding text + if text != None: + my_text = mpy.TextClip(text, fontsize=20, color='white') + my_text = my_text.set_duration(num_steps) + video_with_new_audio = mpy.CompositeVideoClip([video_with_new_audio, my_text]) + ''' + video_clip.write_videofile(os.path.join(output_dir, video_name)) + + # For inserting text + assert len(images)==len(text), 'image and text length are not same' + # inserting text + cap = cv2.VideoCapture(os.path.join(output_dir, video_name)) + frame_width = int(cap.get(3)) + frame_height = int(cap.get(4)) + frame_size = (frame_width,frame_height) + output = cv2.VideoWriter(os.path.join(output_dir, 'cv2', video_name, video_name), cv2.VideoWriter_fourcc('M','J','P','G'), fps, frame_size) + + #print(len(images),'len(images)', video_name) + # sys.exit() + + x1 = 30 + y1 = 40 + font = cv2.FONT_HERSHEY_SIMPLEX + fontscale = 1.2 + font_thickness = 2 + for cnt in range(len(images)): + ret, frame = cap.read() + # print(text[cnt],'text[cnt]') + if ret: + if cnt == len(images)-2: + cv2.imwrite(os.path.join('/home/sudipta/isavi/dialog_audionav/data/models/from_anoop/savi_15_5/run_2/video_dir', 'cv2', video_name, '{}.jpg'.format(cnt)), frame) + + if text[cnt]: + cv2.imwrite(os.path.join('/home/sudipta/isavi/dialog_audionav/data/models/from_anoop/savi_15_5/run_2/video_dir', 'cv2', video_name, '{}.jpg'.format(cnt)), frame) + (w, h), _ = cv2.getTextSize(text[cnt], font, fontscale, font_thickness) + cv2.rectangle(frame, (x1-10, y1-35), (x1+w+10, y1+h+5), (0, 255, 255), -1) + cv2.putText(frame, + text[cnt], + (x1, y1), + font, fontscale, + (255, 0, 0), + font_thickness, + cv2.LINE_4) + + if cv2.waitKey(1) & 0xFF == ord('q'): + break + output.write(frame) + + else: + break + + # sys.exit() + + + cap.release() + cv2.destroyAllWindows() + output.release() + + video_clip = mpy.VideoFileClip(os.path.join(output_dir, 'cv2', video_name, video_name)) + video_with_new_audio = video_clip.set_audio(composite_audio_clip) + video_with_new_audio.write_videofile(os.path.join(output_dir, video_name)) + + os.remove(temp_file_name) + + +def resize_observation(observations, model_resolution): + for observation in observations: + observation['rgb'] = cv2.resize(observation['rgb'], (model_resolution, model_resolution)) + observation['depth'] = np.expand_dims(cv2.resize(observation['depth'], (model_resolution, model_resolution)), + axis=-1) + + +def 
convert_semantics_to_rgb(semantics): + r"""Converts semantic IDs to RGB images. + """ + semantics = semantics.long() % 40 + mapping_rgb = torch.from_numpy(d3_40_colors_rgb).to(semantics.device) + semantics_r = torch.take(mapping_rgb[:, 0], semantics) + semantics_g = torch.take(mapping_rgb[:, 1], semantics) + semantics_b = torch.take(mapping_rgb[:, 2], semantics) + semantics_rgb = torch.stack([semantics_r, semantics_g, semantics_b], -1) + + return semantics_rgb + + +class ResizeCenterCropper(nn.Module): + def __init__(self, size, channels_last: bool = False): + r"""An nn module the resizes and center crops your input. + Args: + size: A sequence (w, h) or int of the size you wish to resize/center_crop. + If int, assumes square crop + channels_last: indicates if channels is the last dimension + """ + super().__init__() + if isinstance(size, numbers.Number): + size = (int(size), int(size)) + assert len(size) == 2, "forced input size must be len of 2 (w, h)" + self._size = size + self.channels_last = channels_last + + def transform_observation_space( + self, observation_space, trans_keys=["rgb", "depth", "semantic"] + ): + size = self._size + observation_space = copy.deepcopy(observation_space) + if size: + for key in observation_space.spaces: + if ( + key in trans_keys + and observation_space.spaces[key].shape != size + ): + logger.info( + "Overwriting CNN input size of %s: %s" % (key, size) + ) + observation_space.spaces[key] = overwrite_gym_box_shape( + observation_space.spaces[key], size + ) + self.observation_space = observation_space + return observation_space + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if self._size is None: + return input + + return center_crop( + image_resize_shortest_edge( + input, max(self._size), channels_last=self.channels_last + ), + self._size, + channels_last=self.channels_last, + ) + + +def image_resize_shortest_edge( + img, size: int, channels_last: bool = False +) -> torch.Tensor: + """Resizes an img so that the shortest side is length of size while + preserving aspect ratio. + + Args: + img: the array object that needs to be resized (HWC) or (NHWC) + size: the size that you want the shortest edge to be resize to + channels: a boolean that channel is the last dimension + Returns: + The resized array as a torch tensor. + """ + img = to_tensor(img) + no_batch_dim = len(img.shape) == 3 + if len(img.shape) < 3 or len(img.shape) > 5: + raise NotImplementedError() + if no_batch_dim: + img = img.unsqueeze(0) # Adds a batch dimension + if channels_last: + h, w = img.shape[-3:-1] + if len(img.shape) == 4: + # NHWC -> NCHW + img = img.permute(0, 3, 1, 2) + else: + # NDHWC -> NDCHW + img = img.permute(0, 1, 4, 2, 3) + else: + # ..HW + h, w = img.shape[-2:] + + # Percentage resize + scale = size / min(h, w) + h = int(h * scale) + w = int(w * scale) + img = torch.nn.functional.interpolate( + img.float(), size=(h, w), mode="area" + ).to(dtype=img.dtype) + if channels_last: + if len(img.shape) == 4: + # NCHW -> NHWC + img = img.permute(0, 2, 3, 1) + else: + # NDCHW -> NDHWC + img = img.permute(0, 1, 3, 4, 2) + if no_batch_dim: + img = img.squeeze(dim=0) # Removes the batch dimension + return img + + +def center_crop(img, size, channels_last: bool = False): + """Performs a center crop on an image. + + Args: + img: the array object that needs to be resized (either batched or unbatched) + size: A sequence (w, h) or a python(int) that you want cropped + channels_last: If the channels are the last dimension. 
+ Returns: + the resized array + """ + if channels_last: + # NHWC + h, w = img.shape[-3:-1] + else: + # NCHW + h, w = img.shape[-2:] + + if isinstance(size, numbers.Number): + size = (int(size), int(size)) + assert len(size) == 2, "size should be (h,w) you wish to resize to" + cropx, cropy = size + + startx = w // 2 - (cropx // 2) + starty = h // 2 - (cropy // 2) + if channels_last: + return img[..., starty : starty + cropy, startx : startx + cropx, :] + else: + return img[..., starty : starty + cropy, startx : startx + cropx] + + +def overwrite_gym_box_shape(box: Box, shape) -> Box: + if box.shape == shape: + return box + shape = list(shape) + list(box.shape[len(shape) :]) + low = box.low if np.isscalar(box.low) else np.min(box.low) + high = box.high if np.isscalar(box.high) else np.max(box.high) + return Box(low=low, high=high, shape=shape, dtype=box.dtype) + + +class NpEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.integer): + return int(obj) + elif isinstance(obj, np.floating): + return float(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return super(NpEncoder, self).default(obj) + + +def observations_to_image(observation: Dict, info: Dict, pred=None) -> np.ndarray: + r"""Generate image of single frame from observation and info + returned from a single environment step(). + + Args: + observation: observation returned from an environment step(). + info: info returned from an environment step(). + + Returns: + generated image of a single frame. + """ + egocentric_view = [] + if "rgb" in observation: + observation_size = observation["rgb"].shape[0] + rgb = observation["rgb"] + if not isinstance(rgb, np.ndarray): + rgb = rgb.cpu().numpy() + + egocentric_view.append(rgb) + ''' + # draw depth map if observation has depth info + if "depth" in observation: + observation_size = observation["depth"].shape[0] + depth_map = observation["depth"].squeeze() * 255.0 + if not isinstance(depth_map, np.ndarray): + depth_map = depth_map.cpu().numpy() + + depth_map = depth_map.astype(np.uint8) + depth_map = np.stack([depth_map for _ in range(3)], axis=2) + egocentric_view.append(depth_map) + ''' + assert ( + len(egocentric_view) > 0 + ), "Expected at least one visual sensor enabled." 
+ egocentric_view = np.concatenate(egocentric_view, axis=1) + + # draw collision + if "collisions" in info and info["collisions"]["is_collision"]: + egocentric_view = draw_collision(egocentric_view) + + frame = egocentric_view + + if "top_down_map" in info: + top_down_map = info["top_down_map"]["map"] + top_down_map = maps.colorize_topdown_map( + top_down_map, info["top_down_map"]["fog_of_war_mask"] + ) + map_agent_pos = info["top_down_map"]["agent_map_coord"] + top_down_map = maps.draw_agent( + image=top_down_map, + agent_center_coord=map_agent_pos, + agent_rotation=info["top_down_map"]["agent_angle"], + agent_radius_px=top_down_map.shape[0] // 50, + ) + if pred is not None: + from habitat.utils.geometry_utils import quaternion_rotate_vector + + # the agent rotation here must be a quaternion + source_rotation = info["top_down_map"]["agent_rotation"] + + rounded_pred = np.round(pred[1]) + direction_vector_agent = np.array([rounded_pred[1], 0, -rounded_pred[0]]) + direction_vector = quaternion_rotate_vector(source_rotation, direction_vector_agent) + + grid_size = ( + (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000, + (maps.COORDINATE_MAX - maps.COORDINATE_MIN) / 10000, + ) + delta_x = int(-direction_vector[0] / grid_size[0]) + delta_y = int(direction_vector[2] / grid_size[1]) + + x = np.clip(map_agent_pos[0] + delta_x, a_min=0, a_max=top_down_map.shape[0]) + y = np.clip(map_agent_pos[1] + delta_y, a_min=0, a_max=top_down_map.shape[1]) + point_padding = 12 + for m in range(x - point_padding, x + point_padding + 1): + for n in range(y - point_padding, y + point_padding + 1): + if np.linalg.norm(np.array([m - x, n - y])) <= point_padding and \ + 0 <= m < top_down_map.shape[0] and 0 <= n < top_down_map.shape[1]: + top_down_map[m, n] = (0, 255, 255) + if np.linalg.norm(rounded_pred) < 1: + assert delta_x == 0 and delta_y == 0 + + if top_down_map.shape[0] > top_down_map.shape[1]: + top_down_map = np.rot90(top_down_map, 1) + + # scale top down map to align with rgb view + if pred is None: + old_h, old_w, _ = top_down_map.shape + top_down_height = observation_size + top_down_width = int(float(top_down_height) / old_h * old_w) + # cv2 resize (dsize is width first) + top_down_map = cv2.resize( + top_down_map.astype(np.float32), + (top_down_width, top_down_height), + interpolation=cv2.INTER_CUBIC, + ) + else: + # draw label + CATEGORY_INDEX_MAPPING = { + 'chair': 0, + 'table': 1, + 'picture': 2, + 'cabinet': 3, + 'cushion': 4, + 'sofa': 5, + 'bed': 6, + 'chest_of_drawers': 7, + 'plant': 8, + 'sink': 9, + 'toilet': 10, + 'stool': 11, + 'towel': 12, + 'tv_monitor': 13, + 'shower': 14, + 'bathtub': 15, + 'counter': 16, + 'fireplace': 17, + 'gym_equipment': 18, + 'seating': 19, + 'clothes': 20 + } + index2label = {v: k for k, v in CATEGORY_INDEX_MAPPING.items()} + pred_label = index2label[pred[0]] + text_height = int(observation_size * 0.1) + + old_h, old_w, _ = top_down_map.shape + top_down_height = observation_size - text_height + top_down_width = int(float(top_down_height) / old_h * old_w) + # cv2 resize (dsize is width first) + top_down_map = cv2.resize( + top_down_map.astype(np.float32), + (top_down_width, top_down_height), + interpolation=cv2.INTER_CUBIC, + ) + + top_down_map = np.concatenate( + [np.ones([text_height, top_down_map.shape[1], 3], dtype=np.int32) * 255,
top_down_map], axis=0) + top_down_map = cv2.putText(top_down_map, 'C_t: ' + pred_label.replace('_', ' '), (10, text_height - 10), + cv2.FONT_HERSHEY_SIMPLEX, 1.4, (0, 0, 0), 2, cv2.LINE_AA) + + frame = np.concatenate((egocentric_view, top_down_map), axis=1) + return frame diff --git a/ss_baselines/savi/README.md b/ss_baselines/savi/README.md new file mode 100644 index 0000000..db0655b --- /dev/null +++ b/ss_baselines/savi/README.md @@ -0,0 +1,45 @@ +# Semantic Audio-Visual Navigation (SAVi) Model + +## Details +This folder provides the code of the model as well as the training/evaluation configurations used in the +[Semantic Audio-Visual Navigation](https://arxiv.org/pdf/2012.11583.pdf) paper. +Usage of this model is similar to what is described in the usage section of the main README file. +Simply replace av_nav with savi in the command. + +Note that the numbers in the paper were initially reported on Habitat-Lab v0.1.5. Later versions of Habitat-Lab +handle random seeding slightly differently. The difference in performance should be within 1%. +Pretrained weights are provided. + +## Usage +1. Pretrain the label predictor (or use the pretrained model weights from this repo): +``` +python ss_baselines/savi/pretraining/audiogoal_trainer.py --run-type train --model-dir data/models/savi --predict-label +``` +2. Train the SAVi model with the trained label predictor (the location predictor is better trained online) with DDPPO. +Submit slurm.sh to your slurm cluster for training. If clusters are not available, use the following training commands to train with PPO. +SAVi is first trained with an external memory size of 1, which only uses the last observation. +It is then fine-tuned with the whole external memory, with the encoders frozen. Please update the pretrained_weights path in savi.yaml with the best pretrained checkpoint when fine-tuning. +``` +python ss_baselines/savi/run.py --exp-config ss_baselines/savi/config/semantic_audionav/savi_pretraining.yaml --model-dir data/models/savi +python ss_baselines/savi/run.py --exp-config ss_baselines/savi/config/semantic_audionav/savi.yaml --model-dir data/models/savi +``` +3. Evaluate the pretrained model: +``` +python ss_baselines/savi/run.py --run-type eval --exp-config ss_baselines/savi/config/semantic_audionav/savi.yaml EVAL_CKPT_PATH_DIR data/pretrained_weights/semantic_audionav/savi/best_val.pth EVAL.SPLIT test USE_SYNC_VECENV True RL.DDPPO.pretrained False +``` + +## Citation +If you use this model in your research, please cite the following paper: +``` +@inproceedings{chen21semantic, + title = {Semantic Audio-Visual Navigation}, + author = {Changan Chen and Ziad Al-Halah and Kristen Grauman}, + booktitle = {CVPR}, + year = {2021} +} +``` + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 diff --git a/ss_baselines/savi/__init__.py b/ss_baselines/savi/__init__.py new file mode 100644 index 0000000..ebc347e --- /dev/null +++ b/ss_baselines/savi/__init__.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree.
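For reference, the `observations_to_image` helper completed above is the per-step entry point when assembling videos; a minimal sketch with dummy inputs (module path assumed):

```
import numpy as np

from ss_baselines.common.utils import observations_to_image  # module path assumed

# Dummy observation/info mimicking one env.step() result; without a
# "top_down_map" entry the frame is just the egocentric RGB view.
observation = {"rgb": np.zeros((128, 128, 3), dtype=np.uint8)}
info = {"collisions": {"is_collision": False}}

frame = observations_to_image(observation, info)
assert frame.shape == (128, 128, 3)
```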
+ +from ss_baselines.savi.ddppo.algo.ddppo_trainer import DDPPOTrainer + +__all__ = ["DDPPOTrainer"] \ No newline at end of file diff --git a/ss_baselines/savi/config/__init__.py b/ss_baselines/savi/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ss_baselines/savi/config/default.py b/ss_baselines/savi/config/default.py new file mode 100644 index 0000000..f7353de --- /dev/null +++ b/ss_baselines/savi/config/default.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0-or-later +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional, Union +import os +import logging +import shutil + +import numpy as np + +from habitat import get_config as get_task_config +from habitat.config import Config as CN +import habitat +from habitat.config.default import SIMULATOR_SENSOR + +DEFAULT_CONFIG_DIR = "configs/" +CONFIG_FILE_SEPARATOR = "," +# ----------------------------------------------------------------------------- +# EXPERIMENT CONFIG +# ----------------------------------------------------------------------------- +_C = CN() +_C.SEED = 0 +_C.BASE_TASK_CONFIG_PATH = "configs/tasks/pointgoal.yaml" +_C.TASK_CONFIG = CN() # task_config will be stored as a config node +_C.CMD_TRAILING_OPTS = [] # store command line options as list of strings +_C.TRAINER_NAME = "savi" +_C.ENV_NAME = "AudioNavRLEnv" +_C.SIMULATOR_GPU_ID = 0 +_C.TORCH_GPU_ID = 0 +_C.VIDEO_OPTION = ["disk", "tensorboard"] +_C.VISUALIZATION_OPTION = ["top_down_map"] +_C.TENSORBOARD_DIR = "tb" +_C.VIDEO_DIR = "video_dir" +_C.TEST_EPISODE_COUNT = 2 +_C.EVAL_CKPT_PATH_DIR = "data/checkpoints" # path to ckpt or path to ckpts dir +_C.NUM_PROCESSES = 16 +_C.SENSORS = ["RGB_SENSOR", "DEPTH_SENSOR"] +_C.CHECKPOINT_FOLDER = "data/checkpoints" +_C.MODEL_DIR = 'data/models/output' +_C.NUM_UPDATES = 10000 +_C.NUM_UPDATES_DIALOG = 30000 +_C.LOG_INTERVAL = 10 +_C.LOG_FILE = "train.log" +_C.CHECKPOINT_INTERVAL = 50000 +_C.CHECKPOINT_INTERVAL_DIALOG = 1000 +_C.USE_VECENV = True +_C.USE_SYNC_VECENV = False +_C.EXTRA_RGB = False +_C.DEBUG = False +_C.USE_LAST_CKPT = False +_C.DISPLAY_RESOLUTION = 128 +_C.RESUME_CHECKPOINT = False +_C.ORACLE_WHEN_QUERIED = False +_C.REPLAY_STORE = False +_C.SOUND_TYPE = 'unheard' # just for logging in test case +# ----------------------------------------------------------------------------- +# EVAL CONFIG +# ----------------------------------------------------------------------------- +_C.EVAL = CN() +# The split to evaluate on +_C.EVAL.SPLIT = "val" +_C.EVAL.USE_CKPT_CONFIG = True +# ----------------------------------------------------------------------------- +# REINFORCEMENT LEARNING (RL) ENVIRONMENT CONFIG +# ----------------------------------------------------------------------------- +_C.RL = CN() +_C.RL.SUCCESS_REWARD = 10.0 +_C.RL.SLACK_REWARD = -0.01 +_C.RL.WITH_TIME_PENALTY = True +_C.RL.WITH_DISTANCE_REWARD = True +_C.RL.DISTANCE_REWARD_SCALE = 1.0 +_C.RL.TIME_DIFF = False + + +# unnecessary now +# ------------------- +# sudipta +_C.RL.QUERY_REWARD = -1.0 +_C.RL.CONSECUTIVE_REWARD = -.5 +_C.RL.QUERY_REWARD_DISTANCE = -1.0 +_C.RL.WITH_QUERY_CONSTRAINT = True +_C.RL.NUM_TOTAL_QUERY = 5 +_C.RL.SOFT_QUERY_REWARD = False
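The defaults above are yacs-style config nodes (`CN` is habitat's `Config`). A minimal sketch of the node mechanics that `get_config` further below relies on (values illustrative):

```
from habitat.config import Config as CN

node = CN()
node.SEED = 0
node.RL = CN()
node.RL.SUCCESS_REWARD = 10.0

node.freeze()    # recursively marks the tree read-only
node.defrost()   # re-enables mutation, as get_config() does before merging
node.merge_from_list(["RL.SUCCESS_REWARD", 5.0, "SEED", 1])
assert node.RL.SUCCESS_REWARD == 5.0
```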
+_C.RL.SOFT_QUERY_REWARD_MAX = -0.1 +_C.RL.DISTANCE_DISTRIBUTION_TYPE='gaussian' + +_C.RL.WITH_DISTANCE_CONSTRAINT = False +_C.RL.MEAN = 0.0 +_C.RL.SD = 0.5 +_C.RL.ALPHA = 2.0 +_C.RL.BETA = 2.0 + + +# ----------------------------------------------------------------------------- +# PROXIMAL POLICY OPTIMIZATION (PPO) +# ----------------------------------------------------------------------------- +_C.RL.PPO = CN() +_C.RL.PPO.clip_param = 0.2 +_C.RL.PPO.ppo_epoch = 4 +_C.RL.PPO.num_mini_batch = 2 +_C.RL.PPO.value_loss_coef = 0.5 +_C.RL.PPO.entropy_coef = 0.01 +_C.RL.PPO.lr = 7e-4 +_C.RL.PPO.eps = 1e-5 +_C.RL.PPO.max_grad_norm = 0.5 +_C.RL.PPO.num_steps = 5 +_C.RL.PPO.hidden_size = 512 +_C.RL.PPO.use_gae = True +_C.RL.PPO.use_linear_lr_decay = False +_C.RL.PPO.use_linear_clip_decay = False +_C.RL.PPO.gamma = 0.99 +_C.RL.PPO.tau = 0.95 +_C.RL.PPO.reward_window_size = 50 +_C.RL.PPO.use_normalized_advantage = False +_C.RL.PPO.policy_type = 'rnn' +_C.RL.PPO.use_external_memory = False + +# -------- +_C.RL.PPO.use_state_memory = False + +_C.RL.PPO.use_mlp_state_encoder = False +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER = CN() +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.memory_size = 300 +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.hidden_size = 128 +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.nhead = 8 +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.num_encoder_layers = 1 +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.num_decoder_layers = 1 +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.dropout = 0.0 +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.activation = 'relu' +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.use_pretrained = False +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.pretrained_path = '' +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.freeze_encoders = False +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.pretraining = False +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.use_action_encoding = True +_C.RL.PPO.SCENE_MEMORY_TRANSFORMER.use_belief_encoding = False +_C.RL.PPO.use_belief_predictor = False +_C.RL.PPO.BELIEF_PREDICTOR = CN() +_C.RL.PPO.BELIEF_PREDICTOR.online_training = False +_C.RL.PPO.BELIEF_PREDICTOR.lr = 1e-3 +_C.RL.PPO.BELIEF_PREDICTOR.audio_only = False +_C.RL.PPO.BELIEF_PREDICTOR.train_encoder = False +_C.RL.PPO.BELIEF_PREDICTOR.normalize_category_distribution = False +_C.RL.PPO.BELIEF_PREDICTOR.use_label_belief = True +_C.RL.PPO.BELIEF_PREDICTOR.use_location_belief = True +_C.RL.PPO.BELIEF_PREDICTOR.current_pred_only = False +_C.RL.PPO.BELIEF_PREDICTOR.weighting_factor = 0.5 +# ----------------------------------------------------------------------------- +# DECENTRALIZED DISTRIBUTED PROXIMAL POLICY OPTIMIZATION (DD-PPO) +# ----------------------------------------------------------------------------- +_C.RL.DDPPO = CN() +_C.RL.DDPPO.sync_frac = 0.6 +_C.RL.DDPPO.distrib_backend = "GLOO" +_C.RL.DDPPO.rnn_type = "LSTM" +_C.RL.DDPPO.num_recurrent_layers = 1 +_C.RL.DDPPO.backbone = "resnet50" +_C.RL.DDPPO.pretrained_weights = "" +# Loads pretrained weights +_C.RL.DDPPO.pretrained = False +# Whether or not to reset the critic linear layer +_C.RL.DDPPO.reset_critic = True + + + +# ------ sudipta +_C.RL.DDPPO.master_port = None + + + + +# EXTRA CONFIG FOR DIALOG +_C.DIALOG_TRAINING = False +_C.DIALOG_TRAINING_WITHOUT_DIALOG = False +_C.NUM_DIALOG_STEPS = 5 + +_C.QUERY_COUNT_EMB_SIZE = 32 +_C.AGENT_STEP_EMB_SIZE = 128 + +# ----------------------------------------------------------------------------- +# TASK CONFIG +# ----------------------------------------------------------------------------- +_TC = habitat.get_config() +_TC.defrost() +# 
----------------------------------------------------------------------------- +# AUDIOGOAL_SENSOR +# ----------------------------------------------------------------------------- +_TC.TASK.AUDIOGOAL_SENSOR = CN() +_TC.TASK.AUDIOGOAL_SENSOR.TYPE = "AudioGoalSensor" +# ----------------------------------------------------------------------------- +# SPECTROGRAM_SENSOR +# ----------------------------------------------------------------------------- +_TC.TASK.SPECTROGRAM_SENSOR = CN() +_TC.TASK.SPECTROGRAM_SENSOR.TYPE = "SpectrogramSensor" +# ----------------------------------------------------------------------------- +# soundspaces +# ----------------------------------------------------------------------------- +_TC.SIMULATOR.GRID_SIZE = 0.5 +_TC.SIMULATOR.CONTINUOUS_VIEW_CHANGE = False +_TC.SIMULATOR.VIEW_CHANGE_FPS = 10 +_TC.SIMULATOR.SCENE_DATASET = 'replica' +_TC.SIMULATOR.USE_RENDERED_OBSERVATIONS = True +_TC.SIMULATOR.SCENE_OBSERVATION_DIR = 'data/scene_observations' +_TC.SIMULATOR.AUDIO = CN() +_TC.SIMULATOR.AUDIO.SCENE = "" +_TC.SIMULATOR.AUDIO.EVERLASTING = True +_TC.SIMULATOR.AUDIO.BINAURAL_RIR_DIR = "data/binaural_rirs" +_TC.SIMULATOR.AUDIO.RIR_SAMPLING_RATE = 44100 +_TC.SIMULATOR.AUDIO.SOURCE_SOUND_DIR = "data/sounds/1s_all" +_TC.SIMULATOR.AUDIO.METADATA_DIR = "data/metadata" +_TC.SIMULATOR.AUDIO.POINTS_FILE = 'points.txt' +_TC.SIMULATOR.AUDIO.GRAPH_FILE = 'graph.pkl' +_TC.SIMULATOR.AUDIO.HAS_DISTRACTOR_SOUND = False +_TC.SIMULATOR.AUDIO.DISTRACTOR_SOUND_DIR = 'data/sounds/1s_all_distractor' +# ----------------------------------------------------------------------------- +# DistanceToGoal Measure +# ----------------------------------------------------------------------------- +_TC.TASK.NORMALIZED_DISTANCE_TO_GOAL = CN() +_TC.TASK.NORMALIZED_DISTANCE_TO_GOAL.TYPE = "NormalizedDistanceToGoal" +# ----------------------------------------------------------------------------- +# Dataset extension +# ----------------------------------------------------------------------------- +_TC.DATASET.VERSION = 'v1' +# ----------------------------------------------------------------------------- +# NumberOfAction Measure +# ----------------------------------------------------------------------------- +_TC.TASK.NUM_ACTION = CN() +_TC.TASK.NUM_ACTION.TYPE = "NA" +_TC.TASK.SUCCESS_WEIGHTED_BY_NUM_ACTION = CN() +_TC.TASK.SUCCESS_WEIGHTED_BY_NUM_ACTION.TYPE = "SNA" +_TC.TASK.VIEW_POINT_GOALS = CN() +_TC.TASK.VIEW_POINT_GOALS.TYPE = "ViewPointGoals" +# ----------------------------------------------------------------------------- +# Intensity estimated from ambisonic +# ----------------------------------------------------------------------------- +_TC.TASK.CATEGORY = SIMULATOR_SENSOR.clone() +_TC.TASK.CATEGORY.TYPE = "Category" +_TC.TASK.CATEGORY_BELIEF = SIMULATOR_SENSOR.clone() +_TC.TASK.CATEGORY_BELIEF.TYPE = "CategoryBelief" +_TC.TASK.LOCATION_BELIEF = SIMULATOR_SENSOR.clone() +_TC.TASK.LOCATION_BELIEF.TYPE = "LocationBelief" +_TC.TASK.SUCCESS_WHEN_SILENT = CN() +_TC.TASK.SUCCESS_WHEN_SILENT.TYPE = "SWS" +# ----------------------------------------------------------------------------- +# POSE SENSOR +# ----------------------------------------------------------------------------- +_TC.TASK.POSE_SENSOR = CN() +_TC.TASK.POSE_SENSOR.TYPE = "PoseSensor" +# ----------------------------------------------------------------------------- +# SEMANTIC OBJECT SENSOR +# ----------------------------------------------------------------------------- +_TC.TASK.SEMANTIC_OBJECT_SENSOR = CN() 
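+# (Assumed from habitat conventions: each sensor/measure TYPE string in this file must match a class registered with habitat's registry elsewhere in the repo; these config nodes only bind names and parameters to those implementations.)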
+_TC.TASK.SEMANTIC_OBJECT_SENSOR.TYPE = "SemanticObjectSensor" +_TC.TASK.SEMANTIC_OBJECT_SENSOR.HEIGHT = 128 +_TC.TASK.SEMANTIC_OBJECT_SENSOR.WIDTH = 128 +_TC.TASK.SEMANTIC_OBJECT_SENSOR.HFOV = 90 # horizontal field of view in degrees +_TC.TASK.SEMANTIC_OBJECT_SENSOR.POSITION = [0, 1.25, 0] +_TC.TASK.SEMANTIC_OBJECT_SENSOR.ORIENTATION = [0.0, 0.0, 0.0] # Euler's angles +_TC.TASK.SEMANTIC_OBJECT_SENSOR.CONVERT_TO_RGB = True +_TC.TASK.ORACLE_ACTION_SENSOR = CN() +_TC.TASK.ORACLE_ACTION_SENSOR.TYPE = "OracleActionSensor" + +''' +# ----------------------------------------------------------------------------- +# ACTION & SIMULATOR for QUERY (sudipta) +# ----------------------------------------------------------------------------- +_TC.TASK.ACTIONS.QUERY = CN() +_TC.TASK.ACTIONS.QUERY.TYPE = "QueryAction" +_TC.SIMULATOR.QUERY_STEP = 2.0 + +# ----------------------------------------------------------------------------- +# OTHER (sudipta) +# ----------------------------------------------------------------------------- +_TC.TASK.POSSIBLE_ACTIONS = ['STOP', 'MOVE_FORWARD', 'TURN_LEFT', 'TURN_RIGHT', 'QUERY'] +''' + +def merge_from_path(config, config_paths): + if config_paths: + if isinstance(config_paths, str): + if CONFIG_FILE_SEPARATOR in config_paths: + config_paths = config_paths.split(CONFIG_FILE_SEPARATOR) + else: + config_paths = [config_paths] + + for config_path in config_paths: + config.merge_from_file(config_path) + return config + + +def get_config( + config_paths: Optional[Union[List[str], str]] = None, + opts: Optional[list] = None, + model_dir: Optional[str] = None, + run_type: Optional[str] = None, + overwrite: bool = False +) -> CN: + r"""Create a unified config with default values overwritten by values from + `config_paths` and overwritten by options from `opts`. + Args: + config_paths: List of config paths or string that contains comma + separated list of config paths. + opts: Config options (keys, values) in a list (e.g., passed from + command line into the config. For example, `opts = ['FOO.BAR', + 0.5]`. Argument can be used for parameter sweeping or quick tests. + model_dir: suffix for output dirs + run_type: either train or eval + """ + config = merge_from_path(_C.clone(), config_paths) + config.TASK_CONFIG = get_task_config(config_paths=config.BASE_TASK_CONFIG_PATH) + + # config_name = os.path.basename(config_paths).split('.')[0] + if model_dir is not None: + config.MODEL_DIR = model_dir + config.TENSORBOARD_DIR = os.path.join(config.MODEL_DIR, 'tb') + config.CHECKPOINT_FOLDER = os.path.join(config.MODEL_DIR, 'data') + config.VIDEO_DIR = os.path.join(config.MODEL_DIR, 'video_dir') + config.LOG_FILE = os.path.join(config.MODEL_DIR, 'train.log') + config.EVAL_CKPT_PATH_DIR = os.path.join(config.MODEL_DIR, 'data') + + if opts: + config.CMD_TRAILING_OPTS = opts + config.merge_from_list(opts) + + dirs = [config.VIDEO_DIR, config.TENSORBOARD_DIR, config.CHECKPOINT_FOLDER] + if run_type == 'train': + # check dirs + if any([os.path.exists(d) for d in dirs]): + for d in dirs: + if os.path.exists(d): + logging.warning('{} exists'.format(d)) + # if overwrite or input('Output directory already exists! Overwrite the folder? 
(y/n)') == 'y': + if overwrite: + for d in dirs: + if os.path.exists(d): + shutil.rmtree(d) + else: + # overwrite training configs + config.defrost() + + if not config.DIALOG_TRAINING: + config.NUM_PROCESSES = 10 + if config.EVAL.SPLIT.startswith('val'): + config.USE_SYNC_VECENV = True + config.TEST_EPISODE_COUNT = 500 + elif config.EVAL.SPLIT.startswith('test'): + config.TEST_EPISODE_COUNT = 1000 + else: + raise ValueError('Dataset split must start with train, val or test!') + else: + config.NUM_PROCESSES = 8 + if config.EVAL.SPLIT == 'val': + config.TEST_EPISODE_COUNT = 7051 + elif config.EVAL.SPLIT == 'train': + config.TEST_EPISODE_COUNT = 77516 + config.freeze() + + config.TASK_CONFIG.defrost() + config.TASK_CONFIG.SIMULATOR.USE_SYNC_VECENV = config.USE_SYNC_VECENV + config.TASK_CONFIG.freeze() + config.freeze() + return config + + +def get_task_config( + config_paths: Optional[Union[List[str], str]] = None, + opts: Optional[list] = None +) -> habitat.Config: + config = _TC.clone() + if config_paths: + if isinstance(config_paths, str): + if CONFIG_FILE_SEPARATOR in config_paths: + config_paths = config_paths.split(CONFIG_FILE_SEPARATOR) + else: + config_paths = [config_paths] + + for config_path in config_paths: + config.merge_from_file(config_path) + + if opts: + config.merge_from_list(opts) + + config.freeze() + return config diff --git a/ss_baselines/savi/config/semantic_audionav/savi.yaml b/ss_baselines/savi/config/semantic_audionav/savi.yaml new file mode 100644 index 0000000..79d5131 --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav/savi.yaml @@ -0,0 +1,62 @@ +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 8 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "smt" + use_belief_predictor: True + use_external_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: True + pretraining: False + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + # choose the best pretrained SAVi based on validation curve + pretrained_weights: "data/models/savi/data/ckpt.XXX.pth" + pretrained: True + reset_critic: False diff --git a/ss_baselines/savi/config/semantic_audionav/savi_interactive_1st_stage.yaml b/ss_baselines/savi/config/semantic_audionav/savi_interactive_1st_stage.yaml new file mode 100644 index 0000000..24a4d41 --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav/savi_interactive_1st_stage.yaml @@ -0,0 +1,91 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later +
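`get_config` above merges the defaults, one or more YAML files, and command-line overrides, and rewrites the output directories under `model_dir`; a hedged sketch (paths and overrides illustrative):

```
from ss_baselines.savi.config.default import get_config

config = get_config(
    config_paths="ss_baselines/savi/config/semantic_audionav/savi.yaml",
    opts=["NUM_PROCESSES", 4, "RL.PPO.lr", 1e-4],
    model_dir="data/models/savi_debug",
    run_type="train",
)
print(config.TENSORBOARD_DIR)   # data/models/savi_debug/tb
print(config.RL.PPO.num_steps)  # 150, from savi.yaml
```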
+BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 6 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 3000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/vln_ckpt.1.pth" +# VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/ckpt.29.pth" +# VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/ckpt.29_fix_bp.pth" + +GOAL_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/best_val.pth" +NUM_DIALOG_STEPS: 3 +REPLAY_STORE: False +ORACLE_WHEN_QUERIED: True +QUERY_WITHIN_RADIUS: True +ALLOW_STOP: False + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + WITH_QUERY_CONSTRAINT: True + WITH_DISTANCE_CONSTRAINT: False + CONSECUTIVE_CONSTRAINT_REWARD: True + CONSECUTIVE_REWARD: -.5 + QUERY_REWARD: -.2 + QUERY_REWARD_DISTANCE: -.2 + NUM_TOTAL_QUERY: 3 + SOFT_QUERY_REWARD: False + SOFT_QUERY_REWARD_MAX: -1.0 + SUCCESS_REWARD: 10.0 + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "interactive" + use_belief_predictor: True + use_external_memory: True + use_state_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + dropout_goal: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: False + pretraining: True + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "" + pretrained: False + reset_critic: False diff --git a/ss_baselines/savi/config/semantic_audionav/savi_interactive_2nd_stage.yaml b/ss_baselines/savi/config/semantic_audionav/savi_interactive_2nd_stage.yaml new file mode 100644 index 0000000..b747e34 --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav/savi_interactive_2nd_stage.yaml @@ -0,0 +1,91 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 6 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 6000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/vln_ckpt.1.pth" +# VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/ckpt.29.pth" +# VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/ckpt.29_fix_bp.pth" + +GOAL_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/best_val.pth" +NUM_DIALOG_STEPS: 3 +REPLAY_STORE: False +ORACLE_WHEN_QUERIED: True +QUERY_WITHIN_RADIUS: True +ALLOW_STOP: False + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + WITH_QUERY_CONSTRAINT: True + WITH_DISTANCE_CONSTRAINT: False + CONSECUTIVE_CONSTRAINT_REWARD: True + 
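+  # (Assumed from the field names: QUERY_REWARD is a flat penalty per oracle query, CONSECUTIVE_REWARD an extra penalty for back-to-back queries, and NUM_TOTAL_QUERY the per-episode query budget.)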
CONSECUTIVE_REWARD: -.5 + QUERY_REWARD: -.2 + QUERY_REWARD_DISTANCE: -.2 + NUM_TOTAL_QUERY: 3 + SOFT_QUERY_REWARD: False + SOFT_QUERY_REWARD_MAX: -1.0 + SUCCESS_REWARD: 10.0 + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "interactive" + use_belief_predictor: True + use_external_memory: True + use_state_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + dropout_goal: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: False + pretraining: False + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "" + pretrained: False + reset_critic: False diff --git a/ss_baselines/savi/config/semantic_audionav/savi_no_label.yaml b/ss_baselines/savi/config/semantic_audionav/savi_no_label.yaml new file mode 100644 index 0000000..bdb5e84 --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav/savi_no_label.yaml @@ -0,0 +1,61 @@ +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 8 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "smt" + use_belief_predictor: True + use_external_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: True + pretraining: False + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: False + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "data/models/savi/data/ckpt.XXX.pth" + pretrained: True + reset_critic: False \ No newline at end of file diff --git a/ss_baselines/savi/config/semantic_audionav/savi_no_location.yaml b/ss_baselines/savi/config/semantic_audionav/savi_no_location.yaml new file mode 100644 index 0000000..a7459bc --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav/savi_no_location.yaml @@ -0,0 +1,65 @@ +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 8 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 
+CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "smt" + use_belief_predictor: True + use_external_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: True + pretraining: False + BELIEF_PREDICTOR: + online_training: False + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: False + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "data/models/savi/data/ckpt.XXX.pth" + pretrained: True + reset_critic: False \ No newline at end of file diff --git a/ss_baselines/savi/config/semantic_audionav/savi_pretraining.yaml b/ss_baselines/savi/config/semantic_audionav/savi_pretraining.yaml new file mode 100644 index 0000000..c9f2a18 --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav/savi_pretraining.yaml @@ -0,0 +1,69 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 8 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "smt" + use_belief_predictor: True + use_external_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 1 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: False + pretraining: True + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "" + pretrained: False + reset_critic: False diff --git a/ss_baselines/savi/config/semantic_audionav/savi_pretraining_dialog.yaml b/ss_baselines/savi/config/semantic_audionav/savi_pretraining_dialog.yaml new file mode 100644 index 0000000..418fbff --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav/savi_pretraining_dialog.yaml @@ -0,0 +1,69 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: 
AGPL-3.0-or-later + +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal_dialog.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 8 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "smt" + use_belief_predictor: True + use_external_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 1 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: False + pretraining: True + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "" + pretrained: False + reset_critic: False diff --git a/ss_baselines/savi/config/semantic_audionav/savi_pretraining_dialog_training.yaml b/ss_baselines/savi/config/semantic_audionav/savi_pretraining_dialog_training.yaml new file mode 100644 index 0000000..96668f1 --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav/savi_pretraining_dialog_training.yaml @@ -0,0 +1,74 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal_dialog.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 8 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +DIALOG_TRAINING: True +NUM_DIALOG_STEPS: 5 +GOAL_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/best_val.pth" + + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "dialog" + use_belief_predictor: True + use_external_memory: True + use_state_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 10 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: False + pretraining: True + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "" + pretrained: False + reset_critic: False diff --git 
a/ss_baselines/savi/config/semantic_audionav/savi_pretraining_dialog_training_without_dialog.yaml b/ss_baselines/savi/config/semantic_audionav/savi_pretraining_dialog_training_without_dialog.yaml new file mode 100644 index 0000000..57276ba --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav/savi_pretraining_dialog_training_without_dialog.yaml @@ -0,0 +1,74 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal_dialog.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 8 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +DIALOG_TRAINING: True +DIALOG_TRAINING_WITHOUT_DIALOG: True +NUM_DIALOG_STEPS: 5 +GOAL_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/best_val.pth" + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "dialog" + use_belief_predictor: True + use_external_memory: True + use_state_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 10 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: False + pretraining: True + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "" + pretrained: False + reset_critic: False diff --git a/ss_baselines/savi/config/semantic_audionav/savi_pretraining_interactive.yaml b/ss_baselines/savi/config/semantic_audionav/savi_pretraining_interactive.yaml new file mode 100644 index 0000000..c447c14 --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav/savi_pretraining_interactive.yaml @@ -0,0 +1,92 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 10 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +#VLN_CKPT_PATH: "data/models/savi/data/vln/ckpt.1.pth" +VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/vln_ckpt.1.pth" +#VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/ckpt.29.pth" +#VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/ckpt.29_fix_bp.pth" + +GOAL_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/best_val.pth" +NUM_DIALOG_STEPS: 3 +REPLAY_STORE: False +ORACLE_WHEN_QUERIED: True +QUERY_WITHIN_RADIUS: True +ALLOW_STOP: False + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + WITH_QUERY_CONSTRAINT: True + WITH_DISTANCE_CONSTRAINT: False + CONSECUTIVE_CONSTRAINT_REWARD: True + CONSECUTIVE_REWARD: -.5 + 
QUERY_REWARD: -.2 + QUERY_REWARD_DISTANCE: -.2 + NUM_TOTAL_QUERY: 3 + SOFT_QUERY_REWARD: False + SOFT_QUERY_REWARD_MAX: -1.0 + SUCCESS_REWARD: 10.0 + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "interactive" + use_belief_predictor: True + use_external_memory: True + use_state_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + dropout_goal: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: False + pretraining: False + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "" + pretrained: False + reset_critic: False diff --git a/ss_baselines/savi/config/semantic_audionav_distractor/savi.yaml b/ss_baselines/savi/config/semantic_audionav_distractor/savi.yaml new file mode 100644 index 0000000..17bccbd --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav_distractor/savi.yaml @@ -0,0 +1,61 @@ +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal_distractor.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 8 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "smt" + use_belief_predictor: True + use_external_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: True + pretraining: False + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + # choose the best pretrained SAVi based on validation curve + pretrained_weights: "data/models/savi/data/ckpt.XXX.pth" + pretrained: True + reset_critic: False \ No newline at end of file diff --git a/ss_baselines/savi/config/semantic_audionav_distractor/savi_interactive_1st_stage.yaml b/ss_baselines/savi/config/semantic_audionav_distractor/savi_interactive_1st_stage.yaml new file mode 100644 index 0000000..911cce8 --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav_distractor/savi_interactive_1st_stage.yaml @@ -0,0 +1,90 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later 
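+# (Staging note, assumed from the fields below and the matching 2nd-stage file: this 1st-stage config trains with SCENE_MEMORY_TRANSFORMER.pretraining: True for 3000 updates, and savi_interactive_2nd_stage.yaml continues with pretraining: False for 6000 updates.)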
+ +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal_distractor.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 6 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 3000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +#VLN_CKPT_PATH: "data/models/savi/data/vln/ckpt.1.pth" +# VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/ckpt.29.pth" +VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav_distractor/savi/vln/ckpt.29_fix_bp.pth" +GOAL_CKPT_PATH: "data/pretrained_weights/semantic_audionav_distractor/savi/ckpt.173.pth" +NUM_DIALOG_STEPS: 3 +REPLAY_STORE: False +ORACLE_WHEN_QUERIED: True +QUERY_WITHIN_RADIUS: True +ALLOW_STOP: False + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + WITH_QUERY_CONSTRAINT: True + WITH_DISTANCE_CONSTRAINT: False + CONSECUTIVE_CONSTRAINT_REWARD: True + CONSECUTIVE_REWARD: -.5 + QUERY_REWARD: -.2 + QUERY_REWARD_DISTANCE: -.2 + NUM_TOTAL_QUERY: 3 + SOFT_QUERY_REWARD: False + SOFT_QUERY_REWARD_MAX: -1.0 + SUCCESS_REWARD: 10.0 + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "interactive" + use_belief_predictor: True + use_external_memory: True + use_state_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + dropout_goal: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: False + pretraining: True + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "" + pretrained: False + reset_critic: False diff --git a/ss_baselines/savi/config/semantic_audionav_distractor/savi_interactive_2nd_stage.yaml b/ss_baselines/savi/config/semantic_audionav_distractor/savi_interactive_2nd_stage.yaml new file mode 100644 index 0000000..e6e0450 --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav_distractor/savi_interactive_2nd_stage.yaml @@ -0,0 +1,90 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal_distractor.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 6 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 6000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +#VLN_CKPT_PATH: "data/models/savi/data/vln/ckpt.1.pth" +# VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/ckpt.29.pth" +VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav_distractor/savi/vln/ckpt.29_fix_bp.pth" +GOAL_CKPT_PATH: "data/pretrained_weights/semantic_audionav_distractor/savi/ckpt.173.pth" +NUM_DIALOG_STEPS: 3 +REPLAY_STORE: False +ORACLE_WHEN_QUERIED: True +QUERY_WITHIN_RADIUS: True +ALLOW_STOP: False + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + WITH_QUERY_CONSTRAINT: True + WITH_DISTANCE_CONSTRAINT: False + 
CONSECUTIVE_CONSTRAINT_REWARD: True + CONSECUTIVE_REWARD: -.5 + QUERY_REWARD: -.2 + QUERY_REWARD_DISTANCE: -.2 + NUM_TOTAL_QUERY: 3 + SOFT_QUERY_REWARD: False + SOFT_QUERY_REWARD_MAX: -1.0 + SUCCESS_REWARD: 10.0 + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "interactive" + use_belief_predictor: True + use_external_memory: True + use_state_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + dropout_goal: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: False + pretraining: False + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "" + pretrained: False + reset_critic: False diff --git a/ss_baselines/savi/config/semantic_audionav_distractor/savi_no_label.yaml b/ss_baselines/savi/config/semantic_audionav_distractor/savi_no_label.yaml new file mode 100644 index 0000000..09bf6c7 --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav_distractor/savi_no_label.yaml @@ -0,0 +1,61 @@ +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal_distractor.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 8 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +RL: + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "smt" + use_belief_predictor: True + use_external_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: True + pretraining: False + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: False + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "data/models/savi/data/ckpt.XXX.pth" + pretrained: True + reset_critic: False \ No newline at end of file diff --git a/ss_baselines/savi/config/semantic_audionav_distractor/savi_no_location.yaml b/ss_baselines/savi/config/semantic_audionav_distractor/savi_no_location.yaml new file mode 100644 index 0000000..15e454f --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav_distractor/savi_no_location.yaml @@ -0,0 +1,65 @@ +BASE_TASK_CONFIG_PATH: 
"configs/semantic_audionav/savi/mp3d/semantic_audiogoal_distractor.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 8 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "smt" + use_belief_predictor: True + use_external_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: True + pretraining: False + BELIEF_PREDICTOR: + online_training: False + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: False + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "data/models/savi/data/ckpt.XXX.pth" + pretrained: True + reset_critic: False \ No newline at end of file diff --git a/ss_baselines/savi/config/semantic_audionav_distractor/savi_pretraining.yaml b/ss_baselines/savi/config/semantic_audionav_distractor/savi_pretraining.yaml new file mode 100644 index 0000000..34a134f --- /dev/null +++ b/ss_baselines/savi/config/semantic_audionav_distractor/savi_pretraining.yaml @@ -0,0 +1,65 @@ +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal_distractor.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 8 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "smt" + use_belief_predictor: True + use_external_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 1 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: False + pretraining: True + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "" + pretrained: False + reset_critic: False \ No newline at end of file diff --git a/ss_baselines/savi/config/semantic_audionav_distractor/savi_pretraining_interactive.yaml b/ss_baselines/savi/config/semantic_audionav_distractor/savi_pretraining_interactive.yaml new file mode 100644 index 0000000..be9c555 --- /dev/null +++ 
b/ss_baselines/savi/config/semantic_audionav_distractor/savi_pretraining_interactive.yaml @@ -0,0 +1,90 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# +# SPDX-License-Identifier: AGPL-3.0-or-later + +BASE_TASK_CONFIG_PATH: "configs/semantic_audionav/savi/mp3d/semantic_audiogoal_distractor.yaml" +TRAINER_NAME: "ddppo" +NUM_PROCESSES: 6 +SENSORS: ["DEPTH_SENSOR", "RGB_SENSOR"] +NUM_UPDATES: 20000 +LOG_INTERVAL: 10 +CHECKPOINT_INTERVAL: 50 +VIDEO_OPTION: [] +VISUALIZATION_OPTION: [] +#VLN_CKPT_PATH: "data/models/savi/data/vln/ckpt.1.pth" +# VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav/savi/vln/ckpt.29.pth" +VLN_CKPT_PATH: "data/pretrained_weights/semantic_audionav_distractor/savi/vln/ckpt.29_fix_bp.pth" +GOAL_CKPT_PATH: "data/pretrained_weights/semantic_audionav_distractor/savi/ckpt.173.pth" +NUM_DIALOG_STEPS: 3 +REPLAY_STORE: False +ORACLE_WHEN_QUERIED: True +QUERY_WITHIN_RADIUS: True +ALLOW_STOP: False + +EVAL: + SPLIT: "val" + USE_CKPT_CONFIG: True + +RL: + WITH_QUERY_CONSTRAINT: True + WITH_DISTANCE_CONSTRAINT: False + CONSECUTIVE_CONSTRAINT_REWARD: True + CONSECUTIVE_REWARD: -.5 + QUERY_REWARD: -.2 + QUERY_REWARD_DISTANCE: -.2 + NUM_TOTAL_QUERY: 3 + SOFT_QUERY_REWARD: False + SOFT_QUERY_REWARD_MAX: -1.0 + SUCCESS_REWARD: 10.0 + PPO: + clip_param: 0.2 + ppo_epoch: 2 + num_mini_batch: 2 + value_loss_coef: 0.5 + entropy_coef: 0.05 + lr: 2.5e-4 + eps: 1e-5 + max_grad_norm: 0.2 + # decide the length of history that ppo encodes + num_steps: 150 + hidden_size: 512 + use_gae: True + gamma: 0.99 + tau: 0.95 + use_linear_clip_decay: False + use_linear_lr_decay: False + # window size for calculating the past rewards + reward_window_size: 50 + use_normalized_advantage: False + policy_type: "interactive" + use_belief_predictor: True + use_external_memory: True + use_state_memory: True + SCENE_MEMORY_TRANSFORMER: + memory_size: 150 + hidden_size: 256 + nhead: 8 + num_encoder_layers: 1 + num_decoder_layers: 1 + dropout: 0.0 + dropout_goal: 0.0 + activation: 'relu' + use_pretrained: False + pretrained_path: '' + freeze_encoders: False + pretraining: False + BELIEF_PREDICTOR: + online_training: True + train_encoder: False + lr: 1e-3 + use_label_belief: True + use_location_belief: True + DDPPO: + sync_frac: 0.6 + distrib_backend: "GLOO" + rnn_type: "GRU" + num_recurrent_layers: 1 + backbone: "custom_resnet18" + pretrained_weights: "" + pretrained: False + reset_critic: False diff --git a/ss_baselines/savi/ddppo/__init__.py b/ss_baselines/savi/ddppo/__init__.py new file mode 100644 index 0000000..195cc30 --- /dev/null +++ b/ss_baselines/savi/ddppo/__init__.py @@ -0,0 +1,11 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
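+# Re-exported below so that callers can simply write: +# from ss_baselines.savi.ddppo import DDPPOTrainer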
+ +from ss_baselines.savi.ddppo.algo import DDPPOTrainer \ No newline at end of file diff --git a/ss_baselines/savi/ddppo/algo/__init__.py b/ss_baselines/savi/ddppo/algo/__init__.py new file mode 100644 index 0000000..eca118e --- /dev/null +++ b/ss_baselines/savi/ddppo/algo/__init__.py @@ -0,0 +1 @@ +from ss_baselines.savi.ddppo.algo.ddppo_trainer import DDPPOTrainer \ No newline at end of file diff --git a/ss_baselines/savi/ddppo/algo/ddp_utils.py b/ss_baselines/savi/ddppo/algo/ddp_utils.py new file mode 100644 index 0000000..03d6aa4 --- /dev/null +++ b/ss_baselines/savi/ddppo/algo/ddp_utils.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import os +import os.path as osp +import shlex +import signal +import subprocess +import threading +from typing import Any, Optional, Tuple +import random + +import ifcfg +import torch +import torch.distributed as distrib + +from habitat import logger + +EXIT = threading.Event() +EXIT.clear() +REQUEUE = threading.Event() +REQUEUE.clear() + + +# Default port to initialize the TCP store on +DEFAULT_PORT = 0  # 8738 +# Default address of world rank 0 +DEFAULT_MASTER_ADDR = "169.235.18.95"  # "127.0.0.1" + +SLURM_JOBID = os.environ.get("SLURM_JOB_ID", None) +INTERRUPTED_STATE_FILE = osp.join( + os.environ["HOME"], ".interrupted_states", f"{SLURM_JOBID}.pth" +) + + +def _clean_exit_handler(signum, frame): + EXIT.set() + print("Exiting cleanly", flush=True) + + +def _requeue_handler(signal, frame): + print("Got signal to requeue", flush=True) + EXIT.set() + REQUEUE.set() + + +def add_signal_handlers(): + signal.signal(signal.SIGINT, _clean_exit_handler) + signal.signal(signal.SIGTERM, _clean_exit_handler) + + # SIGUSR2 can be sent to all processes to have them clean up + # and exit nicely. This is nice to use with SLURM as scancel + # sets a 30 second timer for the job to exit, and it can take more than + # 30 seconds for the job to clean up and exit nicely. When using NCCL, + # forcing the job to exit without cleaning up can be bad. + # scancel --signal SIGUSR2 will set no such timer and will give + # the job ample time to clean up and exit. + signal.signal(signal.SIGUSR2, _clean_exit_handler) + + signal.signal(signal.SIGUSR1, _requeue_handler) + + +def save_interrupted_state(state: Any, filename: str = None): + r"""Saves the interrupted job state to the specified filename. + This is useful when working with preemptable job partitions. + + This method will do nothing if SLURM is not currently being used and the filename is the default. + + :param state: The state to save + :param filename: The filename. Defaults to "${HOME}/.interrupted_states/${SLURM_JOBID}.pth" + """ + if SLURM_JOBID is None and filename is None: + logger.warn("SLURM_JOBID is None, not saving interrupted state") + return + + if filename is None: + filename = INTERRUPTED_STATE_FILE + + torch.save(state, filename) + + +def load_interrupted_state(filename: str = None) -> Optional[Any]: + r"""Loads the saved interrupted state + + :param filename: The filename of the saved state.
+ Defaults to "${HOME}/.interrupted_states/${SLURM_JOBID}.pth" + + :return: The saved state if the file exists, else None + """ + if SLURM_JOBID is None and filename is None: + return None + + if filename is None: + filename = INTERRUPTED_STATE_FILE + + if not osp.exists(filename): + return None + + return torch.load(filename, map_location="cpu") + + +def requeue_job(): + r"""Requeues the job by calling ``scontrol requeue ${SLURM_JOBID}`` + """ + if SLURM_JOBID is None: + return + + if not REQUEUE.is_set(): + return + + distrib.barrier() + + if distrib.get_rank() == 0: + logger.info(f"Requeueing job {SLURM_JOBID}") + subprocess.check_call(shlex.split(f"scontrol requeue {SLURM_JOBID}")) + + +def get_ifname(): + return ifcfg.default_interface()["device"] + + +def init_distrib_slurm( + backend: str = "nccl", +) -> Tuple[int, torch.distributed.TCPStore]: + r"""Initializes torch.distributed by parsing environment variables set + by SLURM when ``srun`` is used or by parsing environment variables set + by torch.distributed.launch. + + :param backend: Which torch.distributed backend to use + + :returns: Tuple of the local_rank (aka which GPU to use for this process) + and the TCPStore used for the rendezvous + """ + assert ( + torch.distributed.is_available() + ), "torch.distributed must be available" + + if "GLOO_SOCKET_IFNAME" not in os.environ: + os.environ["GLOO_SOCKET_IFNAME"] = get_ifname() + + if "NCCL_SOCKET_IFNAME" not in os.environ: + os.environ["NCCL_SOCKET_IFNAME"] = get_ifname() + + master_port = int(os.environ.get("MASTER_PORT", DEFAULT_PORT)) + master_addr = os.environ.get("MASTER_ADDR", DEFAULT_MASTER_ADDR) + + # Check to see if we should parse from torch.distributed.launch + if os.environ.get("LOCAL_RANK", None) is not None: + local_rank = int(os.environ["LOCAL_RANK"]) + world_rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + # Else parse from SLURM if SLURM is being used + elif os.environ.get("SLURM_JOBID", None) is not None: + local_rank = int(os.environ["SLURM_LOCALID"]) + world_rank = int(os.environ["SLURM_PROCID"]) + world_size = int(os.environ["SLURM_NTASKS"]) + # Otherwise set up just 1 process; this is nice for testing + else: + local_rank = 0 + world_rank = 0 + world_size = 1 + + tcp_store = distrib.TCPStore( + master_addr, master_port, world_size, world_rank == 0 + ) + distrib.init_process_group( + backend, store=tcp_store, rank=world_rank, world_size=world_size + ) + + return local_rank, tcp_store diff --git a/ss_baselines/savi/ddppo/algo/ddppo.py b/ss_baselines/savi/ddppo/algo/ddppo.py new file mode 100644 index 0000000..9c6b2a5 --- /dev/null +++ b/ss_baselines/savi/ddppo/algo/ddppo.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree.
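+# Overview of this module: DDPPO derives from PPO via DecentralizedDistributedMixin, which wraps the actor-critic in torch.nn.parallel.DistributedDataParallel and swaps in an advantage estimate normalized by a mean/variance all-reduced across workers (see distributed_mean_and_var below).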
+ +from typing import Tuple + +import torch +import torch.distributed as distrib + +from ss_baselines.savi.models.rollout_storage import RolloutStorage +from ss_baselines.savi.ppo.ppo import PPO + +EPS_PPO = 1e-5 + + +def distributed_mean_and_var( + values: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + r"""Computes the mean and variance of a tensor over multiple workers. + + This method is equivalent to first collecting all versions of values and + then computing the mean and variance locally over that. + + :param values: (*,) shaped tensors to compute mean and variance over. Assumed + to be solely the worker's local copy of this tensor; + the resultant mean and variance will be computed + over _all_ workers' versions of this tensor. + """ + assert distrib.is_initialized(), "Distributed must be initialized" + + world_size = distrib.get_world_size() + mean = values.mean() + distrib.all_reduce(mean) + mean /= world_size + + sq_diff = (values - mean).pow(2).mean() + distrib.all_reduce(sq_diff) + var = sq_diff / world_size + + return mean, var + + +class DecentralizedDistributedMixin: + def _get_advantages_distributed( + self, rollouts: RolloutStorage + ) -> torch.Tensor: + advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] + if not self.use_normalized_advantage: + return advantages + + mean, var = distributed_mean_and_var(advantages) + + return (advantages - mean) / (var.sqrt() + EPS_PPO) + + def init_distributed(self, find_unused_params: bool = True) -> None: + r"""Initializes distributed training for the model. + + 1. Broadcasts the model weights from world_rank 0 to all other workers + 2. Adds gradient hooks to the model + + :param find_unused_params: Whether or not to filter out unused parameters + before gradient reduction. This *must* be True if + there are any parameters in the model that were unused in the + forward pass, otherwise the gradient reduction + will not work correctly. + """ + # NB: Used to hide the hooks from the nn.Module, + # so they don't show up in the state_dict + class Guard: + def __init__(self, model, device): + if torch.cuda.is_available(): + self.ddp = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[device], output_device=device + ) + else: + self.ddp = torch.nn.parallel.DistributedDataParallel(model) + + self._ddp_hooks = Guard(self.actor_critic, self.device) + self.get_advantages = self._get_advantages_distributed + + self.reducer = self._ddp_hooks.ddp.reducer + self.find_unused_params = find_unused_params + + def before_backward(self, loss): + super().before_backward(loss) + + if self.find_unused_params: + self.reducer.prepare_for_backward([loss]) + else: + self.reducer.prepare_for_backward([]) + + +class DDPPO(DecentralizedDistributedMixin, PPO): + pass diff --git a/ss_baselines/savi/ddppo/algo/ddppo_trainer.py b/ss_baselines/savi/ddppo/algo/ddppo_trainer.py new file mode 100644 index 0000000..3c4c6f9 --- /dev/null +++ b/ss_baselines/savi/ddppo/algo/ddppo_trainer.py @@ -0,0 +1,1200 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved.
+# +# SPDX-License-Identifier: AGPL-3.0-or-later +# SPDX-License-Identifier: CC-BY-4.0 + +import contextlib +import os +import random +import time +from collections import defaultdict, deque +import sys +from copy import deepcopy +import math + +import numpy as np +import torch +import torch.distributed as distrib +import torch.nn as nn +from torch.optim.lr_scheduler import LambdaLR +from torch.optim.lr_scheduler import CosineAnnealingLR + +from habitat import Config, logger +from ss_baselines.common.baseline_registry import baseline_registry +from ss_baselines.common.env_utils import construct_envs +from ss_baselines.common.environments import get_env_class +from ss_baselines.savi.models.rollout_storage import RolloutStorage +from ss_baselines.common.tensorboard_utils import TensorboardWriter +from ss_baselines.common.utils import batch_obs, linear_decay +from ss_baselines.savi.ddppo.algo.ddp_utils import ( + EXIT, + REQUEUE, + add_signal_handlers, + init_distrib_slurm, + load_interrupted_state, + requeue_job, + save_interrupted_state, +) + +from ss_baselines.savi.ddppo.algo.ddppo import DDPPO +from ss_baselines.savi.models.belief_predictor import BeliefPredictor, BeliefPredictorDDP +from ss_baselines.savi.ppo.ppo_trainer import PPOTrainer +from ss_baselines.savi.ppo.policy import AudioNavSMTPolicy, AudioNavBaselinePolicy, AudioNavDialogPolicy, AudioNavOptionPolicy +sys.path.append('./ss_baselines/savi/dialog/speaker/build/') +sys.path.append('./ss_baselines/savi/dialog/speaker/') +sys.path.append('./ss_baselines/savi/dialog/speaker/tasks/R2R/') +from ss_baselines.savi.dialog.speaker.tasks.R2R.speaker_pipeline import Speaker, SpeakerDDP +from ss_baselines.savi.dialog.ques_gen.utils.train_utils import Vocabulary +vocab_path = './ss_baselines/savi/dialog/ques_gen/processed/vocab_iq_vln.json' + +import pynvml +from pynvml.smi import nvidia_smi +pynvml.nvmlInit() + +SPEAKER = True  # (also change in ppo_trainer.py) +DEBUG = False + +@baseline_registry.register_trainer(name="ddppo") +class DDPPOTrainer(PPOTrainer): + # DD-PPO cuts rollouts short to mitigate the straggler effect. + # This, in theory, can cause some rollouts to be very short. + # All rollouts contribute equally to the loss/model-update, + # thus very short rollouts can be problematic. This threshold + # limits how short a rollout can be, as a fraction of the + # max rollout length. + SHORT_ROLLOUT_THRESHOLD: float = 0.25 + + def __init__(self, config=None): + ''' + # for the time being, stop using interrupted state + # interrupted_state = load_interrupted_state() + if interrupted_state is not None: + config = interrupted_state["config"] + ''' + super().__init__(config) + self.max_dialog_len = 77 # to match the context length of CLIP + self.vocab = Vocabulary() + self.vocab.load(vocab_path) + self.invalid_point_count = 0 + + def teacher_forcing_scheduler(self, update): + tf_ratio = 1.0 + if update > 15000: + tf_ratio = 0.7 + if update > 30000: + tf_ratio = 0.5 + return tf_ratio + + def _setup_actor_critic_agent(self, ppo_cfg: Config, observation_space=None) -> None: + r"""Sets up actor critic and agent for DD-PPO.
+ + Args: + ppo_cfg: config node with relevant params + + Returns: + None + """ + logger.add_filehandler(self.config.LOG_FILE) + action_space = self.envs.action_spaces[0] + + self.action_space = action_space + + has_distractor_sound = self.config.TASK_CONFIG.SIMULATOR.AUDIO.HAS_DISTRACTOR_SOUND + if ppo_cfg.policy_type == 'rnn': + self.actor_critic = AudioNavBaselinePolicy( + observation_space=self.envs.observation_spaces[0], + action_space=self.action_space, + hidden_size=ppo_cfg.hidden_size, + goal_sensor_uuid=self.config.TASK_CONFIG.TASK.GOAL_SENSOR_UUID, + extra_rgb=self.config.EXTRA_RGB, + use_mlp_state_encoder=ppo_cfg.use_mlp_state_encoder + ) + + if ppo_cfg.use_belief_predictor: + belief_cfg = ppo_cfg.BELIEF_PREDICTOR + bp_class = BeliefPredictorDDP if belief_cfg.online_training else BeliefPredictor + self.belief_predictor = bp_class(belief_cfg, self.device, None, None, + ppo_cfg.hidden_size, self.envs.num_envs, has_distractor_sound + ).to(device=self.device) + if belief_cfg.online_training: + params = list(self.belief_predictor.predictor.parameters()) + if belief_cfg.train_encoder: + params += list(self.actor_critic.net.goal_encoder.parameters()) + \ + list(self.actor_critic.net.visual_encoder.parameters()) + \ + list(self.actor_critic.net.action_encoder.parameters()) + self.belief_predictor.optimizer = torch.optim.Adam(params, lr=belief_cfg.lr) + self.belief_predictor.freeze_encoders() + + + elif ppo_cfg.policy_type == 'smt': + smt_cfg = ppo_cfg.SCENE_MEMORY_TRANSFORMER + belief_cfg = ppo_cfg.BELIEF_PREDICTOR + self.actor_critic = AudioNavSMTPolicy( + observation_space=self.envs.observation_spaces[0], + action_space=self.envs.action_spaces[0], + hidden_size=smt_cfg.hidden_size, + nhead=smt_cfg.nhead, + num_encoder_layers=smt_cfg.num_encoder_layers, + num_decoder_layers=smt_cfg.num_decoder_layers, + dropout=smt_cfg.dropout, + activation=smt_cfg.activation, + use_pretrained=smt_cfg.use_pretrained, + pretrained_path=smt_cfg.pretrained_path, + pretraining=smt_cfg.pretraining, + use_belief_encoding=smt_cfg.use_belief_encoding, + use_belief_as_goal=ppo_cfg.use_belief_predictor, + use_label_belief=belief_cfg.use_label_belief, + use_location_belief=belief_cfg.use_location_belief, + normalize_category_distribution=belief_cfg.normalize_category_distribution, + use_category_input=has_distractor_sound, + query_count_emb_size = self.config.QUERY_COUNT_EMB_SIZE, + ) + if smt_cfg.freeze_encoders: + self._static_smt_encoder = True + self.actor_critic.net.freeze_encoders() + + if ppo_cfg.use_belief_predictor: + smt = self.actor_critic.net.smt_state_encoder + bp_class = BeliefPredictorDDP if belief_cfg.online_training else BeliefPredictor + self.belief_predictor = bp_class(belief_cfg, self.device, smt._input_size, smt._pose_indices, + smt.hidden_state_size, self.envs.num_envs, has_distractor_sound + ).to(device=self.device) + if belief_cfg.online_training: + params = list(self.belief_predictor.predictor.parameters()) + if belief_cfg.train_encoder: + params += list(self.actor_critic.net.goal_encoder.parameters()) + \ + list(self.actor_critic.net.visual_encoder.parameters()) + \ + list(self.actor_critic.net.action_encoder.parameters()) + self.belief_predictor.optimizer = torch.optim.Adam(params, lr=belief_cfg.lr) + self.belief_predictor.freeze_encoders() + + # ----------------------------------------------------------------------------- + # add speaker module here + if SPEAKER: + self.speaker = Speaker(device=self.device) + # already sent to cuda and set in eval mode + + elif ppo_cfg.policy_type == 
'dialog': + + smt_cfg = ppo_cfg.SCENE_MEMORY_TRANSFORMER + belief_cfg = ppo_cfg.BELIEF_PREDICTOR + # new for dialog based + self.actor_critic_vln = AudioNavDialogPolicy( + observation_space=self.envs.observation_spaces[0], + action_space=self.envs.action_spaces[0], + hidden_size=smt_cfg.hidden_size, + nhead=smt_cfg.nhead, + num_encoder_layers=smt_cfg.num_encoder_layers, + num_decoder_layers=smt_cfg.num_decoder_layers, + dropout=smt_cfg.dropout, + activation=smt_cfg.activation, + use_pretrained=smt_cfg.use_pretrained, + pretrained_path=smt_cfg.pretrained_path, + pretraining=smt_cfg.pretraining, + use_belief_encoding=smt_cfg.use_belief_encoding, + use_belief_as_goal=ppo_cfg.use_belief_predictor, + use_label_belief=belief_cfg.use_label_belief, + use_location_belief=belief_cfg.use_location_belief, + normalize_category_distribution=belief_cfg.normalize_category_distribution, + use_category_input=has_distractor_sound, + num_steps = self.config.NUM_DIALOG_STEPS, + ) + + if smt_cfg.freeze_encoders: + self._static_smt_encoder = True + self.actor_critic_vln.net.freeze_encoders() + + if ppo_cfg.use_belief_predictor: + smt = self.actor_critic_vln.net.smt_state_encoder + # check, only actor_critic_vln updated when belief predictor updated? + bp_class = BeliefPredictorDDP if belief_cfg.online_training else BeliefPredictor + self.belief_predictor = bp_class(belief_cfg, self.device, smt._input_size, smt._pose_indices, + smt.hidden_state_size, self.envs.num_envs, has_distractor_sound + ).to(device=self.device) + ''' + if belief_cfg.online_training: + params = list(self.belief_predictor.predictor.parameters()) + if belief_cfg.train_encoder: + params += list(self.actor_critic_vln.net.goal_encoder.parameters()) + \ + list(self.actor_critic_vln.net.visual_encoder.parameters()) + \ + list(self.actor_critic_vln.net.action_encoder.parameters()) + self.belief_predictor.optimizer = torch.optim.Adam(params, lr=belief_cfg.lr) + + self.belief_predictor.freeze_encoders() + ''' + # ----------------------------------------------------------------------------- + # add speaker module here + if SPEAKER: + self.speaker = Speaker(device=self.device) + # already sent to cuda and set in eval mode + + + else: + raise ValueError(f'Policy type {ppo_cfg.policy_type} is not defined!') + + + # edit----------------------- + self.actor_critic_vln.to(self.device) + + # load weights for both actor critic and the encoder + pretrained_state = torch.load(self.config.GOAL_CKPT_PATH, map_location="cpu") + self.actor_critic_vln.net.visual_encoder.rgb_encoder.load_state_dict( + { + k[len("actor_critic.net.visual_encoder.rgb_encoder."):]: v + for k, v in pretrained_state["state_dict"].items() + if "actor_critic.net.visual_encoder.rgb_encoder." in k + }, + ) + self.actor_critic_vln.net.visual_encoder.depth_encoder.load_state_dict( + { + k[len("actor_critic.net.visual_encoder.depth_encoder."):]: v + for k, v in pretrained_state["state_dict"].items() + if "actor_critic.net.visual_encoder.depth_encoder." in k + }, + ) + logger.info('visual encoder loaded') + + self.actor_critic_vln.net.goal_encoder.load_state_dict( + { + k[len("actor_critic.net.goal_encoder."):]: v + for k, v in pretrained_state['state_dict'].items() + if "actor_critic.net.goal_encoder." in k + }, + ) + logger.info('goal encoder loaded') + + self.actor_critic_vln.net.action_encoder.load_state_dict( + { + k[len("actor_critic.net.action_encoder."):]: v + for k, v in pretrained_state['state_dict'].items() + if "actor_critic.net.action_encoder." 
in k + }, + ) + logger.info('action_encoder loaded') + + self.belief_predictor.load_state_dict(pretrained_state["belief_predictor"]) + logger.info('belief_predictor loaded') + + if self.config.RL.DDPPO.reset_critic: + nn.init.orthogonal_(self.actor_critic_vln.critic.fc.weight) + nn.init.constant_(self.actor_critic_vln.critic.fc.bias, 0) + + + self.agent = DDPPO( + actor_critic=self.actor_critic_vln, + clip_param=ppo_cfg.clip_param, + ppo_epoch=ppo_cfg.ppo_epoch, + num_mini_batch=ppo_cfg.num_mini_batch, + value_loss_coef=ppo_cfg.value_loss_coef, + entropy_coef=ppo_cfg.entropy_coef, + lr=ppo_cfg.lr, + eps=ppo_cfg.eps, + max_grad_norm=ppo_cfg.max_grad_norm, + use_normalized_advantage=ppo_cfg.use_normalized_advantage, + ) + + + def _setup_actor_critic_agent_interactive(self, ppo_cfg: Config, observation_space=None) -> None: + logger.add_filehandler(self.config.LOG_FILE) + action_space = self.envs.action_spaces[0] + + self.action_space = action_space + has_distractor_sound = self.config.TASK_CONFIG.SIMULATOR.AUDIO.HAS_DISTRACTOR_SOUND + + if ppo_cfg.policy_type == 'interactive': + smt_cfg = ppo_cfg.SCENE_MEMORY_TRANSFORMER + belief_cfg = ppo_cfg.BELIEF_PREDICTOR + + self.actor_critic_goal = AudioNavSMTPolicy( + observation_space=self.envs.observation_spaces[0], + action_space=self.envs.action_spaces[0], + hidden_size=smt_cfg.hidden_size, + nhead=smt_cfg.nhead, + num_encoder_layers=smt_cfg.num_encoder_layers, + num_decoder_layers=smt_cfg.num_decoder_layers, + dropout=smt_cfg.dropout_goal, + activation=smt_cfg.activation, + use_pretrained=smt_cfg.use_pretrained, + pretrained_path=smt_cfg.pretrained_path, + pretraining=False, + use_belief_encoding=smt_cfg.use_belief_encoding, + use_belief_as_goal=ppo_cfg.use_belief_predictor, + use_label_belief=belief_cfg.use_label_belief, + use_location_belief=belief_cfg.use_location_belief, + normalize_category_distribution=belief_cfg.normalize_category_distribution, + use_category_input=has_distractor_sound, + ) + + self.actor_critic_vln = AudioNavDialogPolicy( + observation_space=self.envs.observation_spaces[0], + action_space=self.envs.action_spaces[0], + hidden_size=smt_cfg.hidden_size, + nhead=smt_cfg.nhead, + num_encoder_layers=smt_cfg.num_encoder_layers, + num_decoder_layers=smt_cfg.num_decoder_layers, + dropout=smt_cfg.dropout, + activation=smt_cfg.activation, + use_pretrained=smt_cfg.use_pretrained, + pretrained_path=smt_cfg.pretrained_path, + pretraining=False, + use_belief_encoding=smt_cfg.use_belief_encoding, + use_belief_as_goal=ppo_cfg.use_belief_predictor, + use_label_belief=belief_cfg.use_label_belief, + use_location_belief=belief_cfg.use_location_belief, + normalize_category_distribution=belief_cfg.normalize_category_distribution, + use_category_input=has_distractor_sound, + num_steps=self.config.NUM_DIALOG_STEPS, + ) + + # need a model for the option policy + self.actor_critic_option = AudioNavOptionPolicy( + observation_space=self.envs.observation_spaces[0], + action_space=self.envs.action_spaces[0], + hidden_size=smt_cfg.hidden_size, + nhead=smt_cfg.nhead, + num_encoder_layers=smt_cfg.num_encoder_layers, + num_decoder_layers=smt_cfg.num_decoder_layers, + dropout=smt_cfg.dropout, + activation=smt_cfg.activation, + use_pretrained=smt_cfg.use_pretrained, + pretrained_path=smt_cfg.pretrained_path, + pretraining=smt_cfg.pretraining, + use_belief_encoding=smt_cfg.use_belief_encoding, + use_belief_as_goal=ppo_cfg.use_belief_predictor, + use_label_belief=belief_cfg.use_label_belief, + use_location_belief=belief_cfg.use_location_belief, +
normalize_category_distribution=belief_cfg.normalize_category_distribution, + use_category_input=has_distractor_sound, + # num_steps = self.config.NUM_DIALOG_STEPS, + query_count_emb_size=self.config.QUERY_COUNT_EMB_SIZE, + ) + + if smt_cfg.freeze_encoders: + self._static_smt_encoder = True + self.actor_critic_goal.net.freeze_encoders() + + if ppo_cfg.use_belief_predictor: + smt = self.actor_critic_goal.net.smt_state_encoder # we can use smt_state_encoder of actor_critic_vln too + bp_class = BeliefPredictorDDP if belief_cfg.online_training else BeliefPredictor + self.belief_predictor = bp_class(belief_cfg, self.device, smt._input_size, smt._pose_indices, + smt.hidden_state_size, self.envs.num_envs, has_distractor_sound + ).to(device=self.device) + # not learning online + self.belief_predictor.freeze_encoders() + + # ----------------------------------------------------------------------------- + if SPEAKER: + self.speaker = Speaker(device=self.device) + + else: + raise ValueError(f'Policy type {ppo_cfg.policy_type} is not defined for this case!') + + self.actor_critic_goal.to(self.device) + self.actor_critic_vln.to(self.device) + self.actor_critic_option.to(self.device) + + # for vln + for name, param in self.actor_critic_vln.named_parameters(): + if 'net.clip' in name: + param.requires_grad = False + + ckpt_dict_vln = self.load_checkpoint(self.config.VLN_CKPT_PATH, map_location="cpu") + ckpt2load_vln = {} + for k, v in ckpt_dict_vln["state_dict"].items(): + if k.split('.')[0] == 'actor_critic': + ckpt2load_vln['.'.join(k.split('.')[1:])] = v + self.actor_critic_vln.load_state_dict(ckpt2load_vln, strict=False) + + + # for goal based policy + for name, param in self.actor_critic_goal.named_parameters(): + param.requires_grad = False + ckpt_dict_goal = self.load_checkpoint(self.config.GOAL_CKPT_PATH, map_location="cpu") + + ckpt2load_goal = {} + for k, v in ckpt_dict_goal["state_dict"].items(): + if k.split('.')[0] == 'actor_critic': + ckpt2load_goal['.'.join(k.split('.')[1:])] = v + # iterate over a snapshot of the items, since keys are added and deleted below + for k, v in list(ckpt2load_goal.items()): + if k.split('.')[0] == 'action_distribution': + ckpt2load_goal['action_distribution_goal.'+'.'.join(k.split('.')[1:])] = v + del ckpt2load_goal[k] + if k.split('.')[0] == 'critic': + ckpt2load_goal['critic_goal.'+'.'.join(k.split('.')[1:])] = v + del ckpt2load_goal[k] + self.actor_critic_goal.load_state_dict(ckpt2load_goal, strict=False) + + if not DEBUG: + # loading encoders for option policy + self.actor_critic_option.net.visual_encoder.rgb_encoder.load_state_dict( + { + k[len("actor_critic.net.visual_encoder.rgb_encoder."):]: v + for k, v in ckpt_dict_goal['state_dict'].items() + if "actor_critic.net.visual_encoder.rgb_encoder." in k + }, + ) + self.actor_critic_option.net.visual_encoder.depth_encoder.load_state_dict( + { + k[len("actor_critic.net.visual_encoder.depth_encoder."):]: v + for k, v in ckpt_dict_goal['state_dict'].items() + if "actor_critic.net.visual_encoder.depth_encoder." in k + }, + ) + logger.info('visual_encoder loaded') + self.actor_critic_option.net.goal_encoder.load_state_dict( + { + k[len("actor_critic.net.goal_encoder."):]: v + for k, v in ckpt_dict_goal['state_dict'].items() + if "actor_critic.net.goal_encoder." in k + }, + ) + logger.info('goal_encoder loaded') + self.actor_critic_option.net.action_encoder.load_state_dict( + { + k[len("actor_critic.net.action_encoder."):]: v + for k, v in ckpt_dict_goal['state_dict'].items() + if "actor_critic.net.action_encoder."
in k + }, + ) + logger.info('action_encoder loaded') + + + self.belief_predictor.load_state_dict(ckpt_dict_goal["belief_predictor"]) + logger.info('belief_predictor loaded') + + # releasing checkpoints + ckpt_dict_vln = None + ckpt2load_vln = None + ckpt_dict_goal = None + ckpt2load_goal = None + + if self.config.RL.DDPPO.reset_critic: + nn.init.orthogonal_(self.actor_critic_goal.critic.fc.weight) + nn.init.constant_(self.actor_critic_goal.critic.fc.bias, 0) + + + self.agent = DDPPO( + actor_critic=self.actor_critic_option, + clip_param=ppo_cfg.clip_param, + ppo_epoch=ppo_cfg.ppo_epoch, + num_mini_batch=ppo_cfg.num_mini_batch, + value_loss_coef=ppo_cfg.value_loss_coef, + entropy_coef=ppo_cfg.entropy_coef, + lr=ppo_cfg.lr, + eps=ppo_cfg.eps, + max_grad_norm=ppo_cfg.max_grad_norm, + use_normalized_advantage=ppo_cfg.use_normalized_advantage, + ) + + self.agent_vln = DDPPO( + actor_critic=self.actor_critic_vln, + clip_param=ppo_cfg.clip_param, + ppo_epoch=ppo_cfg.ppo_epoch, + num_mini_batch=ppo_cfg.num_mini_batch, + value_loss_coef=ppo_cfg.value_loss_coef, + entropy_coef=ppo_cfg.entropy_coef, + lr=ppo_cfg.lr, + eps=ppo_cfg.eps, + max_grad_norm=ppo_cfg.max_grad_norm, + use_normalized_advantage=ppo_cfg.use_normalized_advantage, + ) + + with torch.no_grad(): + max_len = 1000 + position = torch.arange(max_len).unsqueeze(1) + div_term = torch.exp(torch.arange(0, self.config.QUERY_COUNT_EMB_SIZE, 2) * (-math.log(10000.0) / self.config.QUERY_COUNT_EMB_SIZE)) + self.pe = torch.zeros(max_len, self.config.QUERY_COUNT_EMB_SIZE) + self.pe[:, 0::2] = torch.sin(position * div_term) + self.pe[:, 1::2] = torch.cos(position * div_term) + + + def train(self) -> None: + r"""Main method for DD-PPO. + + Returns: + None + """ + + self.local_rank, tcp_store = init_distrib_slurm( + self.config.RL.DDPPO.distrib_backend + ) + + add_signal_handlers() + # Stores the number of workers that have finished their rollout + num_rollouts_done_store = distrib.PrefixStore( + "rollout_tracker", tcp_store + ) + num_rollouts_done_store.set("num_done", "0") + + self.world_rank = distrib.get_rank() + self.world_size = distrib.get_world_size() + + self.config.defrost() + self.config.TORCH_GPU_ID = self.local_rank + self.config.SIMULATOR_GPU_ID = self.local_rank + # Multiply by the number of simulators to make sure they also get unique seeds + self.config.TASK_CONFIG.SEED += ( + self.world_rank * self.config.NUM_PROCESSES + ) + + self.config.freeze() + + random.seed(self.config.TASK_CONFIG.SEED) + np.random.seed(self.config.TASK_CONFIG.SEED) + torch.manual_seed(self.config.TASK_CONFIG.SEED) + if torch.cuda.is_available(): + self.device = torch.device("cuda", self.local_rank) + torch.cuda.set_device(self.device) + else: + self.device = torch.device("cpu") + + # use the config savi_pretraining_dialog_training.yaml + # this config internally calls task config semantic_audiogoal_dialog.yaml + if self.config.DIALOG_TRAINING: + self.config.defrost() + # NUM_UPDATES should be set based on number of gpus + self.config.NUM_UPDATES = self.config.NUM_UPDATES_DIALOG + self.config.CHECKPOINT_INTERVAL = self.config.CHECKPOINT_INTERVAL_DIALOG + self.config.RL.PPO.num_steps = self.config.NUM_DIALOG_STEPS + self.config.RL.PPO.SCENE_MEMORY_TRANSFORMER.pretraining = False + self.config.freeze() + + # constructing env + self.envs = construct_envs( + self.config, get_env_class(self.config.ENV_NAME) + ) + ppo_cfg = self.config.RL.PPO + + if ( + not os.path.isdir(self.config.CHECKPOINT_FOLDER) + and self.world_rank == 0 + ): + 
os.makedirs(self.config.CHECKPOINT_FOLDER) + + if self.config.DIALOG_TRAINING: + self._setup_actor_critic_agent(ppo_cfg) + else: + self._setup_actor_critic_agent_interactive(ppo_cfg) + + self.agent.init_distributed(find_unused_params=True) + if not self.config.DIALOG_TRAINING: + self.agent_vln.init_distributed(find_unused_params=True) + + if ppo_cfg.use_belief_predictor and ppo_cfg.BELIEF_PREDICTOR.online_training: + self.belief_predictor.init_distributed(find_unused_params=True) + + if self.world_rank == 0: + logger.info( + "agent number of trainable parameters: {}".format( + sum( + param.numel() + for param in self.agent.parameters() + if param.requires_grad + ) + ) + ) + if not self.config.DIALOG_TRAINING: + logger.info( + "agent_vln number of trainable parameters: {}".format( + sum( + param.numel() + for param in self.agent_vln.parameters() + if param.requires_grad + ) + ) + ) + if ppo_cfg.use_belief_predictor: + logger.info( + "belief predictor number of trainable parameters: {}".format( + sum( + param.numel() + for param in self.belief_predictor.parameters() + if param.requires_grad + ) + ) + ) + logger.info(f"config: {self.config}") + + observations = self.envs.reset() + batch = batch_obs(observations, device=self.device) + obs_space = self.envs.observation_spaces[0] + + + if ppo_cfg.use_external_memory: + if not self.config.DIALOG_TRAINING: + memory_dim_option = self.actor_critic_option.net.memory_dim + memory_dim_goal = self.actor_critic_goal.net.memory_dim + memory_dim_vln = self.actor_critic_vln.net.memory_dim + + else: + memory_dim_option = self.actor_critic_vln.net.memory_dim + memory_dim_goal = self.actor_critic_vln.net.memory_dim + memory_dim_vln = self.actor_critic_vln.net.memory_dim + + else: + memory_dim_option = None + memory_dim_goal = None + memory_dim_vln = None + + if ppo_cfg.use_state_memory: + memory_dim_dialog = ppo_cfg.SCENE_MEMORY_TRANSFORMER.hidden_size + else: + memory_dim_dialog = None + + + rollouts = RolloutStorage( + ppo_cfg.num_steps, + self.envs.num_envs, + obs_space, + self.action_space, + ppo_cfg.hidden_size, + ppo_cfg.use_external_memory, + ppo_cfg.SCENE_MEMORY_TRANSFORMER.memory_size + ppo_cfg.num_steps, # for goal + ppo_cfg.SCENE_MEMORY_TRANSFORMER.memory_size, + ppo_cfg.SCENE_MEMORY_TRANSFORMER.memory_size + ppo_cfg.num_steps, # for query + ppo_cfg.SCENE_MEMORY_TRANSFORMER.memory_size, + self.config.NUM_DIALOG_STEPS, # for vln + self.config.NUM_DIALOG_STEPS, + memory_dim_goal, + memory_dim_vln, + memory_dim_option, + memory_dim_dialog, + num_recurrent_layers=self.actor_critic_vln.net.num_recurrent_layers, + max_dialog_len=self.max_dialog_len, + use_state_memory = ppo_cfg.use_state_memory, + ) + rollouts.to(self.device) + + + if not self.config.DIALOG_TRAINING and self.config.REPLAY_STORE: + + self.store_dict = { + 'batch': {}, + 'recurrent_hidden_states':[], + 'actions':[], + 'actions_option':[], + 'actions_log_probs_option':[], + 'values':[], + 'rewards':[], + 'masks': [], + 'masks_vln': [], + 'external_memory_features':[], + 'external_memory_dialog_features': [], + 'current_dialog': [], + 'o_action': [], + 'o_mask': [], + 'action_prob': [], + 'current_query_state': [], + 'current_agent_step': [], + } + + for key in batch.keys(): + self.store_dict['batch'][key]=[] + + self.replay_buffer = {idx:deepcopy(self.store_dict) for idx in range(self.config.NUM_PROCESSES)} + rollouts_vln = RolloutStorage( + self.config.NUM_DIALOG_STEPS, + self.envs.num_envs, + obs_space, + self.action_space, + ppo_cfg.hidden_size, + ppo_cfg.use_external_memory, + 
self.config.NUM_DIALOG_STEPS, + self.config.NUM_DIALOG_STEPS, + self.config.NUM_DIALOG_STEPS, + self.config.NUM_DIALOG_STEPS, + self.config.NUM_DIALOG_STEPS, # for vln + self.config.NUM_DIALOG_STEPS, + memory_dim_goal, + memory_dim_vln, + memory_dim_option, + memory_dim_dialog, + num_recurrent_layers=self.actor_critic_vln.net.num_recurrent_layers, + max_dialog_len=self.max_dialog_len, + use_state_memory = ppo_cfg.use_state_memory, + ) + rollouts_vln.to(self.device) + + + # ----------------------------------------- + # set up a dictionary for tracking when the query is triggered + track_query = {idx: {'dialog': [], 'step':0, 'queried': False, 'cons_reward': 0, 'last_query_step': 0, 'total_step': 0, 'all_step': [], 'all_reward': []} for idx in range(self.config.NUM_PROCESSES)} + track_query_count = {idx: 0 for idx in range(self.config.NUM_PROCESSES)} + + if self.config.RL.PPO.use_belief_predictor: + self.belief_predictor.update(batch, None) + + for sensor in rollouts.observations: + rollouts.observations[sensor][0].copy_(batch[sensor]) + + # batch and observations may contain shared PyTorch CUDA + # tensors. We must explicitly clear them here otherwise + # they will be kept in memory for the entire duration of training! + batch = None + observations = None + + current_episode_info = dict( + current_episode_reward = torch.zeros(self.envs.num_envs, 1, device=self.device), + current_episode_reward_goal = torch.zeros(self.envs.num_envs, 1, device=self.device), + current_episode_reward_vln = torch.zeros(self.envs.num_envs, 1, device=self.device), + current_episode_step_goal = torch.zeros(self.envs.num_envs, 1, device=self.device), + current_episode_step_vln = torch.zeros(self.envs.num_envs, 1, device=self.device), + current_episode_step_stat_goal = torch.zeros(self.envs.num_envs, 4, device=self.device), + current_episode_step_stat_vln = torch.zeros(self.envs.num_envs, 4, device=self.device), + current_episode_query_cnt_thresh = torch.zeros(self.envs.num_envs, 1, device=self.device), + current_episode_query_cnt_radius = torch.zeros(self.envs.num_envs, 1, device=self.device), + current_episode_1st_query = torch.zeros(self.envs.num_envs, 1, device=self.device), + current_episode_4th_query = torch.zeros(self.envs.num_envs, 1, device=self.device), + + ) + running_episode_stats = dict( + count=torch.zeros(self.envs.num_envs, 1, device=self.device), + reward=torch.zeros(self.envs.num_envs, 1, device=self.device), + reward_goal=torch.zeros(self.envs.num_envs, 1, device=self.device), + reward_vln=torch.zeros(self.envs.num_envs, 1, device=self.device), + query_count=torch.zeros(self.envs.num_envs, 1, device=self.device), + step_count=torch.zeros(self.envs.num_envs, 1, device=self.device), + forward_step_goal = torch.zeros(self.envs.num_envs, 1, device=self.device), + left_step_goal = torch.zeros(self.envs.num_envs, 1, device=self.device), + right_step_goal = torch.zeros(self.envs.num_envs, 1, device=self.device), + forward_step_vln = torch.zeros(self.envs.num_envs, 1, device=self.device), + left_step_vln = torch.zeros(self.envs.num_envs, 1, device=self.device), + right_step_vln = torch.zeros(self.envs.num_envs, 1, device=self.device), + step_count_goal = torch.zeros(self.envs.num_envs, 1, device=self.device), + step_count_vln = torch.zeros(self.envs.num_envs, 1, device=self.device), + query_count_thresh = torch.zeros(self.envs.num_envs, 1, device=self.device), + query_count_radius = torch.zeros(self.envs.num_envs, 1, device=self.device), + query_step_1st = torch.zeros(self.envs.num_envs, 1, 
device=self.device), + query_step_4th = torch.zeros(self.envs.num_envs, 1, device=self.device), + ) + window_episode_stats = defaultdict( + lambda: deque(maxlen=ppo_cfg.reward_window_size) + ) + + + t_start = time.time() + env_time = 0 + pth_time = 0 + count_steps = 0 + count_checkpoints = 0 + start_update = 0 + prev_time = 0 + replay_training_cnt = 0 + + lr_scheduler = LambdaLR( + optimizer=self.agent.optimizer, + lr_lambda=lambda x: linear_decay(x, self.config.NUM_UPDATES), + ) + + if self.config.DIALOG_TRAINING: + lr_scheduler_vln = CosineAnnealingLR(self.agent.dialog_optimizer, T_max=30, eta_min=1e-6) + + if self.config.RESUME_CHECKPOINT: + # Try to resume at previous checkpoint (independent of interrupted states) + count_steps_start, count_checkpoints, start_update, replay_training_cnt, count_checkpoints_vln = self.try_to_resume_checkpoint() + count_steps = count_steps_start + + else: + count_steps_start = 0 + count_checkpoints = 0 + count_checkpoints_vln = 0 + start_update = 0 + + interrupted_state = load_interrupted_state() + assert interrupted_state is None, 'should not start from an interrupted state' + + if interrupted_state is not None: + self.agent.load_state_dict(interrupted_state["state_dict"]) + if self.config.RL.PPO.use_belief_predictor: + self.belief_predictor.load_state_dict(interrupted_state["belief_predictor"]) + self.agent.optimizer.load_state_dict( + interrupted_state["optim_state"] + ) + lr_scheduler.load_state_dict(interrupted_state["lr_sched_state"]) + + requeue_stats = interrupted_state["requeue_stats"] + env_time = requeue_stats["env_time"] + pth_time = requeue_stats["pth_time"] + count_steps = requeue_stats["count_steps"] + count_checkpoints = requeue_stats["count_checkpoints"] + start_update = requeue_stats["start_update"] + prev_time = requeue_stats["prev_time"] + + with ( + TensorboardWriter( + self.config.TENSORBOARD_DIR, flush_secs=self.flush_secs + ) + if self.world_rank == 0 + else contextlib.suppress() + ) as writer: + for update in range(start_update, self.config.NUM_UPDATES): + if self.config.DIALOG_TRAINING: + lr_scheduler_vln.step() + + if ppo_cfg.use_linear_lr_decay: # False + lr_scheduler.step() + + if ppo_cfg.use_linear_clip_decay: # False + self.agent.clip_param = ppo_cfg.clip_param * linear_decay( + update, self.config.NUM_UPDATES + ) + + if EXIT.is_set(): + self.envs.close() + + if REQUEUE.is_set() and self.world_rank == 0: + requeue_stats = dict( + env_time=env_time, + pth_time=pth_time, + count_steps=count_steps, + count_checkpoints=count_checkpoints, + start_update=update, + prev_time=(time.time() - t_start) + prev_time, + ) + state_dict = dict( + state_dict=self.agent.state_dict(), + optim_state=self.agent.optimizer.state_dict(), + lr_sched_state=lr_scheduler.state_dict(), + config=self.config, + requeue_stats=requeue_stats, + ) + if self.config.RL.PPO.use_belief_predictor: + state_dict['belief_predictor'] = self.belief_predictor.state_dict() + save_interrupted_state(state_dict) + + requeue_job() + return + + count_steps_delta = 0 + self.agent.eval() + if not self.config.DIALOG_TRAINING: + self.agent_vln.eval() + self.actor_critic_goal.eval() + if self.config.RL.PPO.use_belief_predictor: + self.belief_predictor.eval() + + # dialog + if self.config.DIALOG_TRAINING: + o_actions = self.envs.compute_oracle_actions() + self.o_actions_updated = np.zeros((self.config.NUM_DIALOG_STEPS, self.config.NUM_PROCESSES)) + self.o_actions_mask = np.ones((self.config.NUM_DIALOG_STEPS, self.config.NUM_PROCESSES)) + + for process_idx in
range(self.config.NUM_PROCESSES): + if len(o_actions[process_idx]) > self.config.NUM_DIALOG_STEPS: + self.o_actions_updated[:, process_idx] = o_actions[process_idx][:self.config.NUM_DIALOG_STEPS] + else: + self.o_actions_updated[:len(o_actions[process_idx]), process_idx] = o_actions[process_idx] + self.o_actions_mask[len(o_actions[process_idx]) - 1:, process_idx] = 0 + + + for step in range(ppo_cfg.num_steps): + ( + delta_pth_time, + delta_env_time, + delta_steps, + track_query, + track_query_count, + replay_store + ) = self._collect_rollout_step( + rollouts, current_episode_info, running_episode_stats, + track_query, track_query_count, tf_ratio=self.teacher_forcing_scheduler(update) + ) + + if not self.config.DIALOG_TRAINING and self.config.REPLAY_STORE: + + self.assign_to_replay_buffer(replay_store) + storing_done = self.store_in_rollout(rollouts_vln) + + if storing_done: + + self.agent_vln.train() + ( + _, + ce_loss_replay, + ) = self._update_agent_vln(rollouts_vln) + # stats = torch.tensor( [ce_loss_replay], device=self.device) + # distrib.all_reduce(stats) + + + #if self.world_rank == 0: + # ce_loss_replay = ce_loss_replay #/ self.world_size #stats[0].item() / self.world_size + # logger.info("replay_training_cnt: {}, cross entropy loss: {}".format(replay_training_cnt, ce_loss_replay)), + + # checkpoint model + + if replay_training_cnt % self.config.CHECKPOINT_INTERVAL_DIALOG == 0: + self.save_checkpoint_vln( + f"vln/ckpt.{count_checkpoints_vln}.pth", + dict(step=replay_training_cnt), + ) + count_checkpoints_vln += 1 + + replay_training_cnt += 1 + + self.agent_vln.eval() + + + # ce_loss_replay = 0 + + else: + ce_loss_replay = 0 + + pth_time += delta_pth_time + env_time += delta_env_time + count_steps_delta += delta_steps + + # This is where the preemption of workers happens. If a + # worker detects it will be a straggler, it preempts itself!
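+ # Worked example of the cutoff below, using values from this file and the configs above (an illustration, not a guarantee): with SHORT_ROLLOUT_THRESHOLD = 0.25 and sync_frac = 0.6 on a world size of 8 workers, a worker that has collected at least 0.25 * num_steps frames preempts its rollout as soon as more than 0.6 * 8 = 4.8 (i.e. 5 or more) workers have reported into num_rollouts_done_store.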
+ if not self.config.DIALOG_TRAINING: + if ( + step + >= ppo_cfg.num_steps * self.SHORT_ROLLOUT_THRESHOLD + ) and int(num_rollouts_done_store.get("num_done")) > ( + self.config.RL.DDPPO.sync_frac * self.world_size + ): + break + + num_rollouts_done_store.add("num_done", 1) + + self.agent.train() + if self.config.RL.PPO.use_belief_predictor: + # self.belief_predictor.train() + self.belief_predictor.set_eval_encoders() + if self._static_smt_encoder: + if not self.config.DIALOG_TRAINING: + self.actor_critic_option.net.set_eval_encoders() + self.actor_critic_goal.net.set_eval_encoders() + self.actor_critic_vln.net.set_eval_encoders() + + else: + self.actor_critic_vln.net.set_eval_encoders() + + ''' + if ppo_cfg.use_belief_predictor and ppo_cfg.BELIEF_PREDICTOR.online_training: + location_predictor_loss, prediction_accuracy = self.train_belief_predictor(rollouts) + else: + ''' + location_predictor_loss = 0 + prediction_accuracy = 0 + + if self.config.DIALOG_TRAINING: + + ( + delta_pth_time, + ce_loss, + ) = self._update_agent_dialog(rollouts) + + pth_time += delta_pth_time + stats = torch.tensor( [ce_loss, count_steps_delta], device=self.device) + distrib.all_reduce(stats) + + observations = self.envs.reset() + batch = batch_obs(observations, device=self.device) + for sensor in rollouts.observations: + rollouts.observations[sensor][0].copy_(batch[sensor]) + if self.config.RL.PPO.use_belief_predictor: + self.belief_predictor.update(batch, None) + batch = None + observations = None + + if self.world_rank == 0: + num_rollouts_done_store.set("num_done", "0") + loss = stats[0].item() / self.world_size + count_steps += stats[1].item() + + writer.add_scalar("Policy/ce_loss", loss, count_steps) + + if update > 0 and update % self.config.LOG_INTERVAL == 0: + logger.info( + "update: {}\tfps: {:.3f}\t".format( + update, + (count_steps - count_steps_start) + / ((time.time() - t_start) + prev_time), + ) + ) + + logger.info( + "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t" + "frames: {}".format( + update, env_time, pth_time, count_steps + ) + ) + logger.info('dialog training: {}, without_dialog: {}'.format(self.config.DIALOG_TRAINING, self.config.DIALOG_TRAINING_WITHOUT_DIALOG)) + logger.info("num_process: {}, weighted sequential cross entropy loss: {}".format(self.config.NUM_PROCESSES, loss)), + + + # checkpoint model + if update % self.config.CHECKPOINT_INTERVAL == 0: + self.save_checkpoint( + f"ckpt.{count_checkpoints}.pth", + dict(step=count_steps), + ) + count_checkpoints += 1 + + + if not self.config.DIALOG_TRAINING: + ( + delta_pth_time, + value_loss, + action_loss, + dist_entropy, + values_debug, return_batch_debug, + unct_loss, + ) = self._update_agent(ppo_cfg, rollouts) + + pth_time += delta_pth_time + + + + stats_ordering = list(sorted(running_episode_stats.keys())) + stats = torch.stack( + [running_episode_stats[k] for k in stats_ordering], 0 + ) + distrib.all_reduce(stats) + + for i, k in enumerate(stats_ordering): + window_episode_stats[k].append(stats[i].clone()) + + stats = torch.tensor( + [value_loss, action_loss, dist_entropy, location_predictor_loss, prediction_accuracy, count_steps_delta, ce_loss_replay, values_debug, return_batch_debug, unct_loss], + device=self.device, + ) + distrib.all_reduce(stats) + + count_steps += stats[5].item() + + if self.world_rank == 0: + num_rollouts_done_store.set("num_done", "0") + + losses = [ + stats[0].item() / self.world_size, + stats[1].item() / self.world_size, + stats[2].item() / self.world_size, + stats[3].item() / self.world_size, + 
stats[4].item() / self.world_size, + stats[6].item() / self.world_size, + stats[7].item() / self.world_size, + stats[8].item() / self.world_size, + stats[9].item() / self.world_size, + + ] + + + deltas = { + k: ( + (v[-1] - v[0]).sum().item() + if len(v) > 1 + else v[0].sum().item() + ) + for k, v in window_episode_stats.items() + + } + + deltas["count"] = max(deltas["count"], 1.0) + + writer.add_scalar( + "Metrics/reward", deltas["reward"] / deltas["count"], count_steps + ) + + metrics = { + k: v / deltas["count"] + for k, v in deltas.items() + if k not in {"reward", "count", 'reward_goal', 'reward_vln', 'query_count', 'step_count', 'forward_step_goal', 'left_step_goal', 'right_step_goal', 'forward_step_vln', 'left_step_vln', 'right_step_vln', 'step_count_goal', 'step_count_vln', 'query_count_thresh', 'query_count_radius', 'query_step_1st', 'query_step_4th'} + } + + if len(metrics) > 0: + for metric, value in metrics.items(): + writer.add_scalar(f"Metrics/{metric}", value, count_steps) + + # for debugging + writer.add_scalar("Debug/reward_goal", deltas['reward_goal']/max(deltas['step_count_goal'],1), count_steps) + writer.add_scalar("Debug/reward_vln", deltas['reward_vln']/max(deltas['step_count_vln'],1) , count_steps) + writer.add_scalar("Debug/window_query_ratio", deltas['query_count']/max(deltas['step_count'],1) , count_steps) + writer.add_scalar("Debug/forward_ratio_goal", deltas['forward_step_goal']/max(deltas['step_count_goal'],1), count_steps) + writer.add_scalar('Debug/left_ratio_goal', deltas['left_step_goal']/max(deltas['step_count_goal'],1), count_steps) + writer.add_scalar('Debug/right_ratio_goal', deltas['right_step_goal']/max(deltas['step_count_goal'],1), count_steps) + writer.add_scalar('Debug/forward_ratio_vln', deltas['forward_step_vln']/max(deltas['step_count_vln'],1), count_steps) + writer.add_scalar('Debug/left_ratio_vln', deltas['left_step_vln']/max(deltas['step_count_vln'],1), count_steps) + writer.add_scalar('Debug/right_ratio_vln', deltas['right_step_vln']/max(deltas['step_count_vln'],1), count_steps) + + + deltas_v2 = { + k: ( + (v[-1] - v[-2]).sum().item() + if len(v) > 1 + else v[0].sum().item() + ) + for k, v in window_episode_stats.items() if k in {'reward_goal', 'step_count_goal', 'reward_vln', 'step_count_vln', 'count', 'query_count', 'step_count', 'query_count_thresh', 'query_count_radius', 'query_step_1st', 'query_step_4th'} + } + ''' + writer.add_scalar("Debug/current_query_ratio", deltas_v2['query_count']/max(deltas_v2['step_count'],1), count_steps) + writer.add_scalar("Debug/current_query_ratio_thresh", deltas_v2['query_count_thresh']/max(deltas_v2['step_count'],1), count_steps) + writer.add_scalar("Debug/current_query_ratio_radius", deltas_v2['query_count_radius']/max(deltas_v2['step_count'],1), count_steps) + writer.add_scalar("Debug/current_query_step_1st", deltas_v2['query_step_1st']/max(deltas_v2['step_count'],1), count_steps) + writer.add_scalar("Debug/current_query_step_4th", deltas_v2['query_step_4th']/max(deltas_v2['step_count'],1), count_steps) + ''' + writer.add_scalar("Debug/current_reward_vln", deltas_v2['reward_vln']/max(deltas_v2['step_count_vln'],1) , count_steps) + writer.add_scalar("Debug/current_reward_goal", deltas_v2['reward_goal']/max(deltas_v2['step_count_goal'],1) , count_steps) + + writer.add_scalar("Debug/current_query", deltas_v2['query_count']/max(deltas_v2['count'],1), count_steps) + writer.add_scalar("Debug/current_query_thresh", deltas_v2['query_count_thresh']/max(deltas_v2['count'],1), count_steps) + 
writer.add_scalar("Debug/current_query_radius", deltas_v2['query_count_radius']/max(deltas_v2['count'],1), count_steps) + writer.add_scalar("Debug/current_query_step_1st", deltas_v2['query_step_1st']/max(deltas_v2['count'],1), count_steps) + writer.add_scalar("Debug/current_query_step_4th", deltas_v2['query_step_4th']/max(deltas_v2['count'],1), count_steps) + + + writer.add_scalar("Policy/value_loss", losses[0], count_steps) + writer.add_scalar("Policy/policy_loss", losses[1], count_steps) + writer.add_scalar("Policy/entropy_loss", losses[2], count_steps) + writer.add_scalar("Policy/predictor_loss", losses[3], count_steps) + writer.add_scalar("Policy/predictor_accuracy", losses[4], count_steps) + writer.add_scalar('Policy/learning_rate', lr_scheduler.get_lr()[0], count_steps) + writer.add_scalar('Policy/values', losses[6], count_steps) + writer.add_scalar('Policy/returns', losses[7], count_steps) + writer.add_scalar('Policy/unct_loss', losses[8], count_steps) + + # log stats + if update > 0 and update % self.config.LOG_INTERVAL == 0: + logger.info( + "update: {}\tfps: {:.3f}\t".format( + update, + (count_steps - count_steps_start) + / ((time.time() - t_start) + prev_time), + ) + ) + + logger.info( + "update: {}\tenv-time: {:.3f}s\tpth-time: {:.3f}s\t" + "frames: {}".format( + update, env_time, pth_time, count_steps + ) + ) + logger.info( + "Average window size: {} {}".format( + len(window_episode_stats["count"]), + " ".join( + "{}: {:.3f}".format(k, v / deltas["count"]) + for k, v in deltas.items() + if k != "count" + ), + ) + ) + logger.info("replay_training_cnt: {}, cross entropy loss: {}".format(replay_training_cnt, losses[5])), + + + + # checkpoint model + if update % self.config.CHECKPOINT_INTERVAL == 0: + self.save_checkpoint( + f"ckpt.{count_checkpoints}.pth", + dict(step=count_steps), + ) + count_checkpoints += 1 + torch.cuda.empty_cache() + + + self.envs.close() diff --git a/ss_baselines/savi/ddppo/policy/__init__.py b/ss_baselines/savi/ddppo/policy/__init__.py new file mode 100644 index 0000000..1af224d --- /dev/null +++ b/ss_baselines/savi/ddppo/policy/__init__.py @@ -0,0 +1 @@ +from .resnet_policy import AudioNavResNetPolicy \ No newline at end of file diff --git a/ss_baselines/savi/ddppo/policy/resnet.py b/ss_baselines/savi/ddppo/policy/resnet.py new file mode 100644 index 0000000..3986fd1 --- /dev/null +++ b/ss_baselines/savi/ddppo/policy/resnet.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
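+# All blocks in this file normalize with nn.GroupNorm rather than BatchNorm, so feature statistics do not depend on the small, correlated per-GPU batches that on-policy rollout collection produces.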
+ +import torch +import torch.nn as nn + + +def conv3x3(in_planes, out_planes, stride=1, groups=1): + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + groups=groups, + ) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d( + in_planes, out_planes, kernel_size=1, stride=stride, bias=False + ) + + +class BasicBlock(nn.Module): + expansion = 1 + resneXt = False + + def __init__( + self, + inplanes, + planes, + ngroups, + stride=1, + downsample=None, + cardinality=1, + ): + super(BasicBlock, self).__init__() + self.convs = nn.Sequential( + conv3x3(inplanes, planes, stride, groups=cardinality), + nn.GroupNorm(ngroups, planes), + nn.ReLU(True), + conv3x3(planes, planes, groups=cardinality), + nn.GroupNorm(ngroups, planes), + ) + self.downsample = downsample + self.relu = nn.ReLU(True) + + def forward(self, x): + residual = x + + out = self.convs(x) + + if self.downsample is not None: + residual = self.downsample(x) + + return self.relu(out + residual) + + +def _build_bottleneck_branch( + inplanes, planes, ngroups, stride, expansion, groups=1 +): + return nn.Sequential( + conv1x1(inplanes, planes), + nn.GroupNorm(ngroups, planes), + nn.ReLU(True), + conv3x3(planes, planes, stride, groups=groups), + nn.GroupNorm(ngroups, planes), + nn.ReLU(True), + conv1x1(planes, planes * expansion), + nn.GroupNorm(ngroups, planes * expansion), + ) + + +class SE(nn.Module): + def __init__(self, planes, r=16): + super().__init__() + self.squeeze = nn.AdaptiveAvgPool2d(1) + self.excite = nn.Sequential( + nn.Linear(planes, int(planes / r)), + nn.ReLU(True), + nn.Linear(int(planes / r), planes), + nn.Sigmoid(), + ) + + def forward(self, x): + b, c, _, _ = x.size() + x = self.squeeze(x) + x = x.view(b, c) + x = self.excite(x) + + return x.view(b, c, 1, 1) + + +def _build_se_branch(planes, r=16): + return SE(planes, r) + + +class Bottleneck(nn.Module): + expansion = 4 + resneXt = False + + def __init__( + self, + inplanes, + planes, + ngroups, + stride=1, + downsample=None, + cardinality=1, + ): + super().__init__() + self.convs = _build_bottleneck_branch( + inplanes, + planes, + ngroups, + stride, + self.expansion, + groups=cardinality, + ) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + def _impl(self, x): + identity = x + + out = self.convs(x) + + if self.downsample is not None: + identity = self.downsample(x) + + return self.relu(out + identity) + + def forward(self, x): + return self._impl(x) + + +class SEBottleneck(Bottleneck): + def __init__( + self, + inplanes, + planes, + ngroups, + stride=1, + downsample=None, + cardinality=1, + ): + super().__init__( + inplanes, planes, ngroups, stride, downsample, cardinality + ) + + self.se = _build_se_branch(planes * self.expansion) + + def _impl(self, x): + identity = x + + out = self.convs(x) + out = self.se(out) * out + + if self.downsample is not None: + identity = self.downsample(x) + + return self.relu(out + identity) + + +class SEResNeXtBottleneck(SEBottleneck): + expansion = 2 + resneXt = True + + +class ResNeXtBottleneck(Bottleneck): + expansion = 2 + resneXt = True + + +class ResNet(nn.Module): + def __init__( + self, in_channels, base_planes, ngroups, block, layers, cardinality=1 + ): + super(ResNet, self).__init__() + self.conv1 = nn.Sequential( + nn.Conv2d( + in_channels, + base_planes, + kernel_size=7, + stride=2, + padding=3, + bias=False, + ), + nn.GroupNorm(ngroups, base_planes), + nn.ReLU(True), 
+ ) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.cardinality = cardinality + + self.inplanes = base_planes + if block.resneXt: + base_planes *= 2 + + self.layer1 = self._make_layer(block, ngroups, base_planes, layers[0]) + self.layer2 = self._make_layer( + block, ngroups, base_planes * 2, layers[1], stride=2 + ) + self.layer3 = self._make_layer( + block, ngroups, base_planes * 2 * 2, layers[2], stride=2 + ) + self.layer4 = self._make_layer( + block, ngroups, base_planes * 2 * 2 * 2, layers[3], stride=2 + ) + + self.final_channels = self.inplanes + self.final_spatial_compress = 1.0 / (2 ** 5) + + def _make_layer(self, block, ngroups, planes, blocks, stride=1): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + nn.GroupNorm(ngroups, planes * block.expansion), + ) + + layers = [] + layers.append( + block( + self.inplanes, + planes, + ngroups, + stride, + downsample, + cardinality=self.cardinality, + ) + ) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, ngroups)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.maxpool(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + return x + + +def resnet18(in_channels, base_planes, ngroups): + model = ResNet(in_channels, base_planes, ngroups, BasicBlock, [2, 2, 2, 2]) + + return model + + +def resnet50(in_channels, base_planes, ngroups): + model = ResNet(in_channels, base_planes, ngroups, Bottleneck, [3, 4, 6, 3]) + + return model + + +def resneXt50(in_channels, base_planes, ngroups): + model = ResNet( + in_channels, + base_planes, + ngroups, + ResNeXtBottleneck, + [3, 4, 6, 3], + cardinality=int(base_planes / 2), + ) + + return model + + +def se_resnet50(in_channels, base_planes, ngroups): + model = ResNet( + in_channels, base_planes, ngroups, SEBottleneck, [3, 4, 6, 3] + ) + + return model + + +def se_resneXt50(in_channels, base_planes, ngroups): + model = ResNet( + in_channels, + base_planes, + ngroups, + SEResNeXtBottleneck, + [3, 4, 6, 3], + cardinality=int(base_planes / 2), + ) + + return model + + +def se_resneXt101(in_channels, base_planes, ngroups): + model = ResNet( + in_channels, + base_planes, + ngroups, + SEResNeXtBottleneck, + [3, 4, 23, 3], + cardinality=int(base_planes / 2), + ) + + return model diff --git a/ss_baselines/savi/ddppo/policy/resnet_policy.py b/ss_baselines/savi/ddppo/policy/resnet_policy.py new file mode 100644 index 0000000..75f9a73 --- /dev/null +++ b/ss_baselines/savi/ddppo/policy/resnet_policy.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
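+# Data flow of ResNetEncoder.forward below: RGB is scaled to [0, 1] and concatenated with depth along the channel axis, average-pooled 2x, optionally normalized by RunningMeanAndVar, passed through the ResNet backbone, then compressed so the flattened output is roughly after_compression_flat_size (2048) elements regardless of backbone choice.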
+ + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from gym import spaces +import logging + +from ss_baselines.common.utils import Flatten, ResizeCenterCropper +from ss_baselines.savi.ddppo.policy import resnet +from ss_baselines.savi.ddppo.policy.running_mean_and_var import ( + RunningMeanAndVar, +) +from ss_baselines.av_nav.models.rnn_state_encoder import RNNStateEncoder +from ss_baselines.savi.ppo.policy import Net, Policy +from ss_baselines.savi.models.visual_cnn import VisualCNN +from ss_baselines.savi.models.audio_cnn import AudioCNN +from soundspaces.tasks.nav import PoseSensor, SpectrogramSensor, Category + + +class AudioNavResNetPolicy(Policy): + def __init__( + self, + observation_space, + action_space, + hidden_size=512, + num_recurrent_layers=2, + rnn_type="LSTM", + resnet_baseplanes=32, + backbone="resnet50", + normalize_visual_inputs=False, + obs_transform=ResizeCenterCropper(size=(256, 256)), + force_blind_policy=False, + use_category_input=False, + has_distractor_sound=False + ): + super().__init__( + AudioNavResNetNet( + observation_space=observation_space, + action_space=action_space, + hidden_size=hidden_size, + num_recurrent_layers=num_recurrent_layers, + rnn_type=rnn_type, + backbone=backbone, + resnet_baseplanes=resnet_baseplanes, + normalize_visual_inputs=normalize_visual_inputs, + obs_transform=obs_transform, + force_blind_policy=force_blind_policy, + use_category_input=use_category_input, + has_distractor_sound=has_distractor_sound + ), + action_space.n, + ) + + +class ResNetEncoder(nn.Module): + def __init__( + self, + observation_space, + baseplanes=32, + ngroups=32, + spatial_size=128, + make_backbone=None, + normalize_visual_inputs=False, + obs_transform=ResizeCenterCropper(size=(256, 256)), + ): + super().__init__() + + self.obs_transform = obs_transform + if self.obs_transform is not None: + observation_space = self.obs_transform.transform_observation_space( + observation_space + ) + + if "rgb" in observation_space.spaces: + self._n_input_rgb = observation_space.spaces["rgb"].shape[2] + spatial_size = observation_space.spaces["rgb"].shape[0] // 2 + else: + self._n_input_rgb = 0 + + if "depth" in observation_space.spaces: + self._n_input_depth = observation_space.spaces["depth"].shape[2] + spatial_size = observation_space.spaces["depth"].shape[0] // 2 + else: + self._n_input_depth = 0 + + if normalize_visual_inputs: + self.running_mean_and_var = RunningMeanAndVar( + self._n_input_depth + self._n_input_rgb + ) + else: + self.running_mean_and_var = nn.Sequential() + + if not self.is_blind: + input_channels = self._n_input_depth + self._n_input_rgb + self.backbone = make_backbone(input_channels, baseplanes, ngroups) + + final_spatial = int( + spatial_size * self.backbone.final_spatial_compress + ) + after_compression_flat_size = 2048 + num_compression_channels = int( + round(after_compression_flat_size / (final_spatial ** 2)) + ) + self.compression = nn.Sequential( + nn.Conv2d( + self.backbone.final_channels, + num_compression_channels, + kernel_size=3, + padding=1, + bias=False, + ), + nn.GroupNorm(1, num_compression_channels), + nn.ReLU(True), + ) + + self.output_shape = ( + num_compression_channels, + final_spatial, + final_spatial, + ) + + @property + def is_blind(self): + return self._n_input_rgb + self._n_input_depth == 0 + + def layer_init(self): + for layer in self.modules(): + if isinstance(layer, (nn.Conv2d, nn.Linear)): + nn.init.kaiming_normal_( + layer.weight, nn.init.calculate_gain("relu") + ) + if 
layer.bias is not None: + nn.init.constant_(layer.bias, val=0) + + def forward(self, observations): + if self.is_blind: + return None + + cnn_input = [] + if self._n_input_rgb > 0: + rgb_observations = observations["rgb"] + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + rgb_observations = rgb_observations.permute(0, 3, 1, 2) + rgb_observations = rgb_observations / 255.0 # normalize RGB + cnn_input.append(rgb_observations) + + if self._n_input_depth > 0: + depth_observations = observations["depth"] + + # permute tensor to dimension [BATCH x CHANNEL x HEIGHT X WIDTH] + depth_observations = depth_observations.permute(0, 3, 1, 2) + + cnn_input.append(depth_observations) + + if self.obs_transform: + cnn_input = [self.obs_transform(inp) for inp in cnn_input] + + x = torch.cat(cnn_input, dim=1) + x = F.avg_pool2d(x, 2) + + x = self.running_mean_and_var(x) + x = self.backbone(x) + x = self.compression(x) + return x + + +class AudioNavResNetNet(Net): + """Network which passes the input image through CNN and concatenates + goal vector with CNN's output and passes that through RNN. + """ + + def __init__( + self, + observation_space, + action_space, + hidden_size, + num_recurrent_layers, + rnn_type, + backbone, + resnet_baseplanes, + normalize_visual_inputs, + obs_transform=ResizeCenterCropper(size=(256, 256)), + force_blind_policy=False, + use_category_input=False, + has_distractor_sound=False + ): + super().__init__() + self._use_category_input = use_category_input + self._hidden_size = hidden_size + + self._is_continuous = False + if action_space.__class__.__name__ == "ActionSpace": + self.prev_action_embedding = nn.Embedding(action_space.n + 1, 32) + else: + self.prev_action_embedding = nn.Linear(action_space.shape[0] + 1, 32) + self._is_continuous = True + self._n_prev_action = 32 + rnn_input_size = self._n_prev_action + + if backbone == 'custom_resnet18': + # self.visual_encoder = SMTCNN(observation_space) + self.visual_encoder = VisualCNN(observation_space, hidden_size) + else: + self.visual_encoder = ResNetEncoder( + observation_space if not force_blind_policy else spaces.Dict({}), + baseplanes=resnet_baseplanes, + ngroups=resnet_baseplanes // 2, + make_backbone=getattr(resnet, backbone), + normalize_visual_inputs=normalize_visual_inputs, + obs_transform=obs_transform, + ) + if PoseSensor.cls_uuid in observation_space.spaces: + self.pose_encoder = nn.Linear(5, 16) + pose_feature_dims = 16 + rnn_input_size += pose_feature_dims + + if SpectrogramSensor.cls_uuid in observation_space.spaces: + self.audio_encoder = AudioCNN(observation_space, 128, SpectrogramSensor.cls_uuid, + has_distractor_sound=has_distractor_sound) + rnn_input_size += 128 + else: + logging.info("Input has no audio") + + if use_category_input: + rnn_input_size += 21 + + if not self.visual_encoder.is_blind: + self.visual_fc = nn.Sequential( + Flatten(), + nn.Linear( + np.prod(self.visual_encoder.output_shape), hidden_size + ), + nn.ReLU(True), + ) + + self.state_encoder = RNNStateEncoder( + (0 if self.is_blind else self._hidden_size) + rnn_input_size, + self._hidden_size, + rnn_type=rnn_type, + num_layers=num_recurrent_layers, + ) + + self.train() + + @property + def output_size(self): + return self._hidden_size + + @property + def is_blind(self): + return self.visual_encoder.is_blind + + @property + def num_recurrent_layers(self): + return self.state_encoder.num_recurrent_layers + + def forward(self, observations, rnn_hidden_states, prev_actions, masks, ext_memory, ext_memory_masks): + x = [] + if not 
self.is_blind: + if "visual_features" in observations: + visual_feats = observations["visual_features"] + else: + visual_feats = self.visual_encoder(observations) + + visual_feats = self.visual_fc(visual_feats) + x.append(visual_feats) + + if not self._is_continuous: + prev_actions = self.prev_action_embedding( + ((prev_actions.float() + 1) * masks).long().squeeze(dim=-1) + ) + else: + prev_actions = self.prev_action_embedding( + prev_actions.float() * masks + ) + x.append(prev_actions) + + if SpectrogramSensor.cls_uuid in observations: + x.append(self.audio_encoder(observations)) + + if PoseSensor.cls_uuid in observations: + pose_formatted = self._format_pose(observations[PoseSensor.cls_uuid]) + pose_encoded = self.pose_encoder(pose_formatted) + x.append(pose_encoded) + + if self._use_category_input: + x.append(observations[Category.cls_uuid]) + + x = torch.cat(x, dim=1) + x, rnn_hidden_states = self.state_encoder(x, rnn_hidden_states, masks) + ext_memory_feats = None + + return x, rnn_hidden_states, ext_memory_feats + + def _format_pose(self, pose): + """ + Args: + pose: (N, 4) Tensor containing x, y, heading, time + """ + x, y, theta, time = torch.unbind(pose, dim=1) + cos_theta, sin_theta = torch.cos(theta), torch.sin(theta) + e_time = torch.exp(-time) + formatted_pose = torch.stack([x, y, cos_theta, sin_theta, e_time], 1) + return formatted_pose + diff --git a/ss_baselines/savi/ddppo/policy/running_mean_and_var.py b/ss_baselines/savi/ddppo/policy/running_mean_and_var.py new file mode 100644 index 0000000..ef42449 --- /dev/null +++ b/ss_baselines/savi/ddppo/policy/running_mean_and_var.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 + +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: CC-BY-4.0 + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
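As a small illustration, not part of the diff, of the pose featurization that AudioNavResNetNet._format_pose applies above: each (x, y, heading, time) reading becomes the 5-dimensional vector (x, y, cos heading, sin heading, exp(-time)) that feeds the Linear(5, 16) pose encoder.

import torch

# Mirrors AudioNavResNetNet._format_pose for a batch of one reading.
pose = torch.tensor([[2.0, -1.0, 0.5, 3.0]])  # (x, y, heading, time)
x, y, theta, time = torch.unbind(pose, dim=1)
formatted = torch.stack(
    [x, y, torch.cos(theta), torch.sin(theta), torch.exp(-time)], dim=1
)
print(formatted.shape)  # torch.Size([1, 5])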
+
+import torch
+import torch.distributed as distrib
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class RunningMeanAndVar(nn.Module):
+    def __init__(self, n_channels):
+        super().__init__()
+        self.register_buffer("_mean", torch.zeros(1, n_channels, 1, 1))
+        self.register_buffer("_var", torch.zeros(1, n_channels, 1, 1))
+        self.register_buffer("_count", torch.zeros(()))
+
+        self._distributed = distrib.is_initialized()
+
+    def forward(self, x):
+        if self.training:
+            new_mean = F.adaptive_avg_pool2d(x, 1).sum(0, keepdim=True)
+            new_count = torch.full_like(self._count, x.size(0))
+
+            if self._distributed:
+                distrib.all_reduce(new_mean)
+                distrib.all_reduce(new_count)
+
+            new_mean /= new_count
+
+            new_var = F.adaptive_avg_pool2d((x - new_mean).pow(2), 1).sum(
+                0, keepdim=True
+            )
+
+            if self._distributed:
+                distrib.all_reduce(new_var)
+
+            # No "- 1" bias correction on the variance: the number of pixels
+            # seen over training is so large that it makes no difference.
+            new_var /= new_count
+
+            m_a = self._var * (self._count)
+            m_b = new_var * (new_count)
+            M2 = (
+                m_a
+                + m_b
+                + (new_mean - self._mean).pow(2)
+                * self._count
+                * new_count
+                / (self._count + new_count)
+            )
+
+            self._var = M2 / (self._count + new_count)
+            self._mean = (self._count * self._mean + new_count * new_mean) / (
+                self._count + new_count
+            )
+
+            self._count += new_count
+
+        stdev = torch.sqrt(
+            torch.max(self._var, torch.full_like(self._var, 1e-2))
+        )
+        return (x - self._mean) / stdev
diff --git a/ss_baselines/savi/dialog/ques_gen/models/__init__.py b/ss_baselines/savi/dialog/ques_gen/models/__init__.py
new file mode 100644
index 0000000..efc20d3
--- /dev/null
+++ b/ss_baselines/savi/dialog/ques_gen/models/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (C) 2019, Ranjay Krishna
+#
+# SPDX-License-Identifier: MIT
+
+from .iq_vln import IQ_VLN
diff --git a/ss_baselines/savi/dialog/ques_gen/models/base_rnn.py b/ss_baselines/savi/dialog/ques_gen/models/base_rnn.py
new file mode 100644
index 0000000..067e287
--- /dev/null
+++ b/ss_baselines/savi/dialog/ques_gen/models/base_rnn.py
@@ -0,0 +1,50 @@
+# Copyright (C) 2019, Ranjay Krishna
+#
+# SPDX-License-Identifier: MIT
+
+""" A base class for RNN. """
+
+import torch.nn as nn
+
+
+class BaseRNN(nn.Module):
+    """Applies a multi-layer RNN to an input sequence.
+
+    Note:
+        Do not use this class directly; use one of the subclasses.
+
+    Inputs: ``*args``, ``**kwargs``
+        - ``*args``: variable length argument list.
+        - ``**kwargs``: arbitrary keyword arguments.
+
+    Attributes:
+        SYM_MASK: masking symbol
+        SYM_EOS: end-of-sequence symbol
+    """
+    SYM_MASK = "MASK"
+    SYM_EOS = "EOS"
+
+    def __init__(self, vocab_size, max_len, hidden_size, input_dropout_p,
+                 dropout_p, n_layers, rnn_cell):
+        """Constructor for BaseRNN.
+        Args:
+            vocab_size (int): size of the vocabulary
+            max_len (int): maximum allowed length for the sequence to be processed
+            hidden_size (int): number of features in the hidden state `h`
+            input_dropout_p (float): dropout probability for the input sequence
+            dropout_p (float): dropout probability for the output sequence
+            n_layers (int): number of recurrent layers
+            rnn_cell (str): type of RNN cell (e.g. 'LSTM', 'GRU')
+        """
+        super(BaseRNN, self).__init__()
+        self.vocab_size = vocab_size
+        self.max_len = max_len
+        self.hidden_size = hidden_size
+        self.n_layers = n_layers
+        self.input_dropout_p = input_dropout_p
+        self.input_dropout = nn.Dropout(p=input_dropout_p)
+        self.rnn_cell = getattr(nn, rnn_cell.upper())
+        self.dropout_p = dropout_p
+
+    def forward(self, *args, **kwargs):
+        raise NotImplementedError()
diff --git a/ss_baselines/savi/dialog/ques_gen/models/decoder_rnn.py b/ss_baselines/savi/dialog/ques_gen/models/decoder_rnn.py
new file mode 100644
index 0000000..7504921
--- /dev/null
+++ b/ss_baselines/savi/dialog/ques_gen/models/decoder_rnn.py
@@ -0,0 +1,222 @@
+# Copyright (C) 2019, Ranjay Krishna
+#
+# SPDX-License-Identifier: MIT
+
+from torch.autograd import Variable
+
+import random
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import sys
+
+from .base_rnn import BaseRNN
+
+
+class DecoderRNN(BaseRNN):
+    """Provides functionality for decoding in a seq2seq framework.
+
+    Attributes:
+        KEY_LENGTH (str): key used to indicate a list representing lengths of
+            output sequences in `ret_dict`
+        KEY_SEQUENCE (str): key used to indicate a list of sequences in
+            `ret_dict`.
+
+    Inputs: inputs, encoder_hidden, encoder_outputs, function, teacher_forcing_ratio
+    - **inputs** (batch, seq_len, input_size): list of sequences, whose length is the batch size and within which
+      each sequence is a list of token IDs. It is used for teacher forcing when provided. (default `None`)
+    - **encoder_hidden** (num_layers * num_directions, batch_size, hidden_size): tensor containing the features in the
+      hidden state `h` of the encoder. Used as the initial hidden state of the decoder. (default `None`)
+    - **encoder_outputs** (batch, seq_len, hidden_size): tensor containing the outputs of the encoder.
+    - **function** (torch.nn.Module): A function used to generate symbols from the RNN hidden state
+      (default is `torch.nn.functional.log_softmax`).
+    - **teacher_forcing_ratio** (float): The probability that teacher forcing will be used. A random number is
+      drawn uniformly from 0-1 for every decoding token, and if the sample is smaller than the given value,
+      teacher forcing is used (default is 0).
+
+    Outputs: decoder_outputs, decoder_hidden, ret_dict
+    - **decoder_outputs** (seq_len, batch, vocab_size): list of tensors
+      with size (batch_size, vocab_size) containing the outputs of
+      the decoding function.
+    - **decoder_hidden** (num_layers * num_directions, batch, hidden_size):
+      tensor containing the last hidden state of the decoder.
+    - **ret_dict**: dictionary containing additional information as
+      follows {
+          *KEY_LENGTH* : list of integers representing lengths of
+          output sequences,
+          *KEY_SEQUENCE* : list of sequences, where each sequence
+          is a list of predicted token IDs
+      }.
+    """
+
+    KEY_LENGTH = 'length'
+    KEY_SEQUENCE = 'sequence'
+
+    def __init__(self, vocab_size, max_len, hidden_size, input_size,
+                 sos_id, eos_id,
+                 n_layers=1, rnn_cell='lstm', bidirectional=False,
+                 input_dropout_p=0, dropout_p=0, embedding=None):
+        """Constructor for DecoderRNN.
+
+        Args:
+            vocab_size (int): size of the vocabulary
+            max_len (int): maximum allowed length for the sequence to be processed
+            hidden_size (int): the number of features in the hidden state `h`
+            input_size (int): dimensionality of the token embeddings fed to the RNN
+            sos_id (int): index of the start-of-sentence symbol
+            eos_id (int): index of the end-of-sentence symbol
+            n_layers (int, optional): number of recurrent layers (default: 1)
+            rnn_cell (str, optional): type of RNN cell (default: 'lstm')
+            bidirectional (bool, optional): whether the encoder is bidirectional
+                (default: False)
+            input_dropout_p (float, optional): dropout probability for the input
+                sequence (default: 0)
+            dropout_p (float, optional): dropout probability for the output sequence
+                (default: 0)
+        """
+        super(DecoderRNN, self).__init__(vocab_size, max_len, hidden_size,
+                                         input_dropout_p, dropout_p,
+                                         n_layers, rnn_cell)
+
+        self.bidirectional_encoder = bidirectional
+        self.rnn = self.rnn_cell(input_size, hidden_size, n_layers, batch_first=True, dropout=dropout_p)
+
+        self.output_size = vocab_size
+        self.input_size = input_size
+        self.max_length = max_len
+        self.eos_id = eos_id
+        self.sos_id = sos_id
+
+        self.init_input = None
+
+        self.embedding = nn.Embedding(self.output_size, self.input_size)
+        if embedding is not None:
+            # self.embedding.weight = nn.Parameter(self.embedding, requires_grad=False)
+            self.embedding.weight.data = embedding
+
+        self.bn = nn.BatchNorm1d(self.hidden_size)
+        self.relu = nn.ReLU()
+        self.out = nn.Linear(self.hidden_size, self.output_size)
+        self.init_weights()
+
+    def init_weights(self):
+        """Initialize weights"""
+        self.embedding.weight.data.uniform_(-0.1, 0.1)
+        # nn.init.xavier_normal_(self.embedding.weight.data)
+        # self.out.weight.data.uniform_(-0.1, 0.1)
+        nn.init.xavier_normal_(self.out.weight.data)
+        # self.out.bias.data.fill_(0)
+
+    def forward_step(self, input_var, hidden, encoder_outputs, function):
+        batch_size = input_var.size(0)
+        output_size = input_var.size(1)
+        embedded = self.embedding(input_var)
+        embedded = self.input_dropout(embedded)
+
+        output, hidden = self.rnn(embedded, hidden)
+        predicted_softmax = function(self.out(output.contiguous().view(-1, self.hidden_size)), dim=1).view(batch_size, output_size, -1)
+        # predicted_softmax = self.out(output.contiguous().view(-1, self.hidden_size)).view(batch_size, output_size, -1)
+        # output = self.relu(self.bn(output.contiguous().view(-1, self.hidden_size)))
+        # predicted_softmax = self.out(output).view(batch_size, output_size, -1)
+        # predicted_softmax = function(output, dim=1).view(batch_size, output_size, -1)
+
+        return predicted_softmax, hidden
+
+    def forward(self, inputs=None, encoder_hidden=None, encoder_outputs=None,
+                function=F.log_softmax, teacher_forcing_ratio=0):
+        ret_dict = dict()
+
+        inputs, batch_size, max_length = self._validate_args(
+            inputs, encoder_hidden, encoder_outputs, function,
+            teacher_forcing_ratio)
+        decoder_hidden = self._init_state(encoder_hidden)
+
+        use_teacher_forcing = (True if random.random() < teacher_forcing_ratio
+                               else False)
+
+        decoder_outputs = []
+        sequence_symbols = []
+        lengths = np.array([max_length] * batch_size)
+
+        def decode(step, step_output):
+            decoder_outputs.append(step_output)
+            symbols = decoder_outputs[-1].topk(1)[1]
+            sequence_symbols.append(symbols)
+
+            eos_batches = symbols.data.eq(self.eos_id)
+            if eos_batches.dim() > 0:
+                eos_batches = eos_batches.cpu().view(-1).numpy()
+                update_idx = ((lengths > di) & eos_batches) != 0
+                lengths[update_idx] = len(sequence_symbols)
+            return symbols
+
+        # Manual unrolling is used to support random teacher forcing.
+        # If teacher_forcing_ratio is True or False instead of a probability,
+        # the unrolling can be done in-graph.
+        if use_teacher_forcing:
+            decoder_input = inputs[:, :-1]
+            decoder_output, decoder_hidden = self.forward_step(
+                decoder_input, decoder_hidden, encoder_outputs,
+                function=function)
+
+            for di in range(decoder_output.size(1)):
+                step_output = decoder_output[:, di, :]
+                decode(di, step_output)
+        else:
+            decoder_input = inputs[:, 0].unsqueeze(1)
+            for di in range(max_length):
+                decoder_output, decoder_hidden = self.forward_step(
+                    decoder_input, decoder_hidden, encoder_outputs,
+                    function=function)
+                step_output = decoder_output.squeeze(1)
+                symbols = decode(di, step_output)
+                decoder_input = symbols
+
+        ret_dict[DecoderRNN.KEY_SEQUENCE] = sequence_symbols
+        ret_dict[DecoderRNN.KEY_LENGTH] = lengths.tolist()
+
+        return decoder_outputs, decoder_hidden, ret_dict
+
+    def _init_state(self, encoder_hidden):
+        """ Initialize the encoder hidden state. """
+        if encoder_hidden is None:
+            return None
+        if isinstance(encoder_hidden, tuple):
+            encoder_hidden = tuple([self._cat_directions(h) for h in encoder_hidden])
+        else:
+            encoder_hidden = self._cat_directions(encoder_hidden)
+        return encoder_hidden
+
+    def _cat_directions(self, h):
+        """ If the encoder is bidirectional, do the following transformation.
+            (#directions * #layers, #batch, hidden_size) -> (#layers, #batch, #directions * hidden_size)
+        """
+        if self.bidirectional_encoder:
+            h = torch.cat([h[0:h.size(0):2], h[1:h.size(0):2]], 2)
+        return h
+
+    def _validate_args(self, inputs, encoder_hidden, encoder_outputs, function, teacher_forcing_ratio):
+        # inference batch size
+        if inputs is None and encoder_hidden is None:
+            batch_size = 1
+        else:
+            if inputs is not None:
+                batch_size = inputs.size(0)
+            else:
+                if self.rnn_cell is nn.LSTM:
+                    batch_size = encoder_hidden[0].size(1)
+                elif self.rnn_cell is nn.GRU:
+                    batch_size = encoder_hidden.size(1)
+
+        # set default input and max decoding length
+        if inputs is None:
+            if teacher_forcing_ratio > 0:
+                raise ValueError("Teacher forcing has to be disabled (set to 0) when no inputs are provided.")
+            inputs = Variable(torch.LongTensor(
+                [self.sos_id] * batch_size)).view(batch_size, 1)
+            if torch.cuda.is_available():
+                inputs = inputs.cuda()
+            max_length = self.max_length
+        else:
+            max_length = inputs.size(1) - 1  # minus the start-of-sequence symbol
+
+        return inputs, batch_size, max_length
diff --git a/ss_baselines/savi/dialog/ques_gen/models/encoder_cnn.py b/ss_baselines/savi/dialog/ques_gen/models/encoder_cnn.py
new file mode 100644
index 0000000..6263d1c
--- /dev/null
+++ b/ss_baselines/savi/dialog/ques_gen/models/encoder_cnn.py
@@ -0,0 +1,39 @@
+# Copyright (C) 2019, Ranjay Krishna
+#
+# SPDX-License-Identifier: MIT
+
+"""Generates a representation for an image input.
+"""
+
+import torch.nn as nn
+import torchvision.models as models
+
+
+class EncoderCNN(nn.Module):
+    """Generates a representation for an image input.
+    """
+
+    def __init__(self, output_size):
+        """Load a pretrained ResNet-18 and replace the top fc layer.
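+
+        The backbone weights are frozen (requires_grad=False); only the
+        newly created fc layer (in_features -> output_size) remains
+        trainable.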
+ """ + super(EncoderCNN, self).__init__() + self.cnn = models.resnet18(pretrained=True) + for param in self.cnn.parameters(): + param.requires_grad = False + self.cnn.fc = nn.Linear(self.cnn.fc.in_features, output_size) + self.relu = nn.ReLU() + # self.bn = nn.BatchNorm1d(output_size, momentum=0.01) + self.init_weights() + + def init_weights(self): + + # self.cnn.fc.weight.data.normal_(0.0, 0.02) + nn.init.xavier_normal_(self.cnn.fc.weight.data) + # self.cnn.fc.bias.data.fill_(0) + + def forward(self, images): + + features = self.relu(self.cnn(images)) + return features + # output = self.bn(features) + # return output diff --git a/ss_baselines/savi/dialog/ques_gen/models/encoder_rnn.py b/ss_baselines/savi/dialog/ques_gen/models/encoder_rnn.py new file mode 100644 index 0000000..9d7c41c --- /dev/null +++ b/ss_baselines/savi/dialog/ques_gen/models/encoder_rnn.py @@ -0,0 +1,91 @@ +# Copyright (C) 2019, Ranjay Krishna +# +# SPDX-License-Identifier: MIT + +import torch.nn as nn + +from .base_rnn import BaseRNN + + +class EncoderRNN(BaseRNN): + """Applies a multi-layer RNN to an input sequence. + + Inputs: inputs, input_lengths + - **inputs**: List of sequences, whose length is the batch size + and within which each sequence is a list of token IDs. + - **input_lengths** (list of int, optional): List that contains + the lengths of sequences in the mini-batch, it must be + provided when using variable length RNN (default: `None`). + + Outputs: output, hidden + - **output** (batch, seq_len, hidden_size): Tensor containing the + encoded features of the input sequence + - **hidden** (num_layers * num_directions, batch, hidden_size): Tensor + containing the features in the hidden state `h` + + Examples:: + >>> encoder = EncoderRNN(input_vocab, max_seq_length, hidden_size) + >>> output, hidden = encoder(input) + """ + + def __init__(self, vocab_size, max_len, hidden_size, + input_dropout_p=0, dropout_p=0, n_layers=1, + bidirectional=False, rnn_cell='lstm', variable_lengths=False): + """Constructor for EncoderRNN. + + Args: + vocab_size (int): Size of the vocabulary. + max_len (int): A maximum allowed length for the sequence to be + processed. + hidden_size (int): The number of features in the hidden state `h`. + input_dropout_p (float, optional): Dropout probability for the input + sequence (default: 0). + dropout_p (float, optional): Dropout probability for the output + sequence (default: 0). + n_layers (int, optional): Number of recurrent layers (default: 1). + bidirectional (bool, optional): if True, becomes a bidirectional + encoder (defulat False). + rnn_cell (str, optional): Type of RNN cell (default: gru). + variable_lengths (bool, optional): If use variable length + RNN (default: False). + """ + super(EncoderRNN, self).__init__(vocab_size, max_len, hidden_size, + input_dropout_p, dropout_p, n_layers, rnn_cell) + self.variable_lengths = variable_lengths + self.embedding = nn.Embedding(vocab_size, hidden_size) + self.rnn = self.rnn_cell(hidden_size, hidden_size, n_layers, + batch_first=True, bidirectional=bidirectional, + dropout=dropout_p) + self.init_weights() + + def init_weights(self): + """Initialize weights. + """ + self.embedding.weight.data.uniform_(-0.1, 0.1) + + def forward(self, input_var, input_lengths=None, h0=None): + """Applies a multi-layer RNN to an input sequence. + + Args: + input_var (batch, seq_len): Tensor containing the features of + the input sequence. + input_lengths (list of int, optional): A list that contains + the lengths of sequences in the mini-batch. 
+            h0 : Tensor containing the initial hidden state.
+
+        Returns: output, hidden
+        - **output** (batch, seq_len, hidden_size): Variable containing
+          the encoded features of the input sequence
+        - **hidden** (num_layers * num_directions, batch, hidden_size):
+          Variable containing the features in the hidden state h
+        """
+        embedded = self.embedding(input_var)
+        embedded = self.input_dropout(embedded)
+        if self.variable_lengths:
+            embedded = nn.utils.rnn.pack_padded_sequence(
+                embedded, input_lengths, batch_first=True)
+        output, hidden = self.rnn(embedded, h0)
+        if self.variable_lengths:
+            output, _ = nn.utils.rnn.pad_packed_sequence(
+                output, batch_first=True)
+        return output, hidden
diff --git a/ss_baselines/savi/dialog/ques_gen/models/iq_vln.py b/ss_baselines/savi/dialog/ques_gen/models/iq_vln.py
new file mode 100644
index 0000000..f8ba175
--- /dev/null
+++ b/ss_baselines/savi/dialog/ques_gen/models/iq_vln.py
@@ -0,0 +1,203 @@
+# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL)
+# Copyright (C) 2019, Ranjay Krishna
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# SPDX-License-Identifier: MIT
+
+"""Contains code for the IQ model.
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .encoder_cnn import EncoderCNN
+from .decoder_rnn import DecoderRNN
+from .mlp import MLP
+import sys
+
+
+class IQ_VLN(nn.Module):
+    """Information Maximization question generation.
+    """
+    def __init__(self, vocab_size, max_len, hidden_size, embedding_dim,
+                 sos_id, eos_id, num_layers=1, rnn_cell='LSTM',
+                 bidirectional=False, input_dropout_p=0, dropout_p=0,
+                 encoder_max_len=None, num_att_layers=2, att_ff_size=512,
+                 embedding=None, z_size=20, no_answer_recon=False,
+                 no_image_recon=False, no_category_space=False):
+        """Constructor for IQ.
+
+        Args:
+            vocab_size: Number of words in the vocabulary.
+            max_len: The maximum length of the questions we generate.
+            hidden_size: Number of dimensions of the RNN hidden cell.
+            sos_id: Vocab id for the start-of-sentence symbol.
+            eos_id: Vocab id for the end-of-sentence symbol.
+            num_layers: The number of layers of the RNNs.
+            rnn_cell: LSTM or RNN or GRU.
+            bidirectional: Whether the RNN is bidirectional.
+            input_dropout_p: Dropout applied to the input question words.
+            dropout_p: Dropout applied internally between RNN steps.
+            encoder_max_len: Maximum length of the encoder.
+            num_att_layers: Number of stacked attention layers.
+            att_ff_size: Dimensions of stacked attention.
+            embedding (vocab_size, hidden_size): Tensor of embeddings or
+                None. If None, embeddings are learned.
+            z_size: Dimensions of noise epsilon.
+        """
+        super(IQ_VLN, self).__init__()
+        self.hidden_size = hidden_size
+        if encoder_max_len is None:
+            encoder_max_len = max_len
+        self.num_layers = num_layers
+
+        # Setup image encoder.
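+        # EncoderCNN yields one hidden_size feature vector per image;
+        # image_proj (an MLP with num_att_layers layers) refines it before
+        # it seeds the decoder RNN's initial hidden state in
+        # decode_questions().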
+        self.encoder_cnn = EncoderCNN(hidden_size)
+        self.image_proj = MLP(hidden_size, att_ff_size, hidden_size,
+                              num_layers=num_att_layers)
+        # self.bn = nn.BatchNorm1d(self.hidden_size)
+        self.decoder = DecoderRNN(vocab_size, max_len, hidden_size, embedding_dim,
+                                  sos_id=sos_id,
+                                  eos_id=eos_id,
+                                  n_layers=num_layers,
+                                  rnn_cell=rnn_cell,
+                                  input_dropout_p=input_dropout_p,
+                                  dropout_p=dropout_p,
+                                  embedding=embedding)
+
+
+    # needed
+    def flatten_parameters(self):
+        if hasattr(self, 'decoder'):
+            self.decoder.rnn.flatten_parameters()
+        if hasattr(self, 'encoder'):
+            self.encoder.rnn.flatten_parameters()
+
+
+    # needed
+    def generator_parameters(self):
+        params = self.parameters()
+        params = filter(lambda p: p.requires_grad, params)
+        return params
+
+
+    # needed
+    def modify_hidden(self, func, hidden, rnn_cell):
+        """Applies the function func to the hidden representation.
+        This method is useful because some RNNs like LSTMs have tuple
+        hidden states.
+        Args:
+            func: A function to apply to the hidden representation.
+            hidden: A RNN (or LSTM or GRU) representation.
+            rnn_cell: One of RNN, LSTM or GRU.
+        Returns:
+            func(hidden).
+        """
+        if rnn_cell is nn.LSTM:
+            return (func(hidden[0]), func(hidden[1]))
+        return func(hidden)
+
+
+    # needed
+    def encode_images(self, images):
+        """Encodes images.
+        Args:
+            images: Batch of image Tensors.
+        Returns:
+            Batch of image features.
+        """
+        images = self.encoder_cnn(images)
+        images = self.image_proj(images)
+        # images = self.bn(images)
+        return images
+
+
+    # needed
+    def decode_questions(self, image_features, questions=None,
+                         teacher_forcing_ratio=0, decode_function=F.log_softmax):
+        """Decodes the question from the latent space.
+        Args:
+            image_features: Batch of image features.
+            questions: Batch of question Variables.
+            teacher_forcing_ratio: Whether to predict with teacher forcing.
+            decode_function: What to use when choosing a word from the
+                distribution over the vocabulary.
+        """
+        batch_size = image_features.size(0)
+
+        # Reshape encoder_hidden (NUM_LAYERS * N * HIDDEN_SIZE).
+        hiddens = image_features.view((1, batch_size, self.hidden_size))
+        hiddens = hiddens.expand((self.num_layers, batch_size,
+                                  self.hidden_size)).contiguous()
+        if self.decoder.rnn_cell is nn.LSTM:
+            hiddens = (hiddens, hiddens)
+        result = self.decoder(inputs=questions,
+                              encoder_hidden=hiddens,
+                              function=decode_function,
+                              teacher_forcing_ratio=teacher_forcing_ratio)
+        return result
+
+    '''
+    # not needed
+    def forward(self, images, questions=None,
+                teacher_forcing_ratio=0, decode_function=F.log_softmax):
+        """Passes the image and the question through a model and generates answers.
+        Args:
+            images: Batch of image Variables.
+            questions: Batch of question Variables.
+            teacher_forcing_ratio: Whether to predict with teacher forcing.
+            decode_function: What to use when choosing a word from the
+                distribution over the vocabulary.
+        Returns:
+            - outputs: The output scores for all steps in the RNN.
+            - hidden: The hidden states of all the RNNs.
+            - ret_dict: A dictionary of attributes. See DecoderRNN.py for details.
+        """
+        # features is (N * HIDDEN_SIZE)
+        image_features = self.encode_images(images)
+        result = self.decode_questions(image_features, questions=questions,
+                                       decode_function=decode_function,
+                                       teacher_forcing_ratio=teacher_forcing_ratio)
+
+        return result
+    '''
+
+
+    def parse_outputs_to_tokens(self, outputs):
+        """Converts model outputs to tokens.
+        Args:
+            outputs: Model outputs.
+        Returns:
+            A tensor of batch_size X max_len.
+ """ + # Take argmax for each timestep + # Output is list of MAX_LEN containing BATCH_SIZE * VOCAB_SIZE. + + # BATCH_SIZE * VOCAB_SIZE -> BATCH_SIZE + # outputs = [o.max(1)[1] for o in outputs] + + # sanity check + outputs = [o.max(1)[1] for o in outputs] + + outputs = torch.stack(outputs) # Tensor(max_len, batch) + outputs = outputs.transpose(0, 1) # Tensor(batch, max_len) + return outputs + + + def predict_from_image(self, images, questions=None, teacher_forcing_ratio=0, + decode_function=F.log_softmax): + """Outputs the predicted vocab tokens for the answers in a minibatch. + Args: + images: Batch of image Tensors. + teacher_forcing_ratio: Whether to predict with teacher forcing. + decode_function: What to use when choosing a word from the + distribution over the vocabulary. + Returns: + A tensor with BATCH_SIZE X MAX_LEN where each element is the index + into the vocab word. + """ + image_features = self.encode_images(images) + outputs, _, _ = self.decode_questions(image_features, questions=questions, + decode_function=decode_function, + teacher_forcing_ratio=teacher_forcing_ratio) + return self.parse_outputs_to_tokens(outputs) diff --git a/ss_baselines/savi/dialog/ques_gen/models/mlp.py b/ss_baselines/savi/dialog/ques_gen/models/mlp.py new file mode 100644 index 0000000..c27b841 --- /dev/null +++ b/ss_baselines/savi/dialog/ques_gen/models/mlp.py @@ -0,0 +1,61 @@ +# Copyright (C) 2019, Ranjay Krishna +# +# SPDX-License-Identifier: MIT + +"""A simple MLP. +""" + +from collections import OrderedDict +from torch import nn + +import math + + +class MLP(nn.Module): + """A simple MLP. + """ + + def __init__(self, input_size, hidden_size, num_classes, + num_layers=1, dropout_p=0.0): + """Constructor for MLP. + + Args: + input_size: The number of input dimensions. + hidden_size: The number of hidden dimensions for each layer. + num_classes: The size of the output. + num_layers: The number of hidden layers. + dropout_p: Dropout probability. + """ + super(MLP, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.num_classes = num_classes + layers = [] + for i in range(num_layers): + idim = hidden_size + odim = hidden_size + if i == 0: + idim = input_size + if i == num_layers-1: + odim = num_classes + fc = nn.Linear(idim, odim) + # fc.weight.data.normal_(0.0, math.sqrt(2. / idim)) + nn.init.xavier_normal_(fc.weight.data) + # fc.bias.data.fill_(0) + layers.append(('fc'+str(i), fc)) + if i != num_layers-1: + layers.append(('relu'+str(i), nn.ReLU())) + layers.append(('dropout'+str(i), nn.Dropout(p=dropout_p))) + self.layers = nn.Sequential(OrderedDict(layers)) + + def params_to_train(self): + return self.layers.parameters() + + def forward(self, x): + """Propagate through all the hidden layers. + + Args: + x: Input of self.input_size dimensions. 
+ """ + out = self.layers(x) + return out diff --git a/ss_baselines/savi/dialog/ques_gen/processed/vocab_iq_vln.json b/ss_baselines/savi/dialog/ques_gen/processed/vocab_iq_vln.json new file mode 100755 index 0000000..a0d75a7 --- /dev/null +++ b/ss_baselines/savi/dialog/ques_gen/processed/vocab_iq_vln.json @@ -0,0 +1 @@ +{"word2idx": {"": 0, "": 1, "": 2, "": 3, "": 4, "?": 5, "circle": 6, "bins": 7, "console": 8, "standing": 9, "radio": 10, "rails": 11, "arriving": 12, "seeing": 13, "yellow": 14, "hallway": 15, "trashcan": 16, "bulletin": 17, "rear": 18, "giant": 19, "could": 20, "yourself": 21, "billiard": 22, "portraits": 23, "watermelon": 24, "upward": 25, "patterned": 26, "christmas": 27, "sixth": 28, "dishwasher": 29, "arm": 30, "3": 31, "floral": 32, "ovens": 33, "copy": 34, "it": 35, "visible": 36, "doorways": 37, "switches": 38, "reach": 39, "crate": 40, "photos": 41, "vessel": 42, "counter": 43, "player": 44, "remain": 45, "handrail": 46, "hardwood": 47, "intersection": 48, "bamboo": 49, "flights": 50, "lined": 51, "can": 52, "business": 53, "wall": 54, "entryway": 55, "still": 56, "jog": 57, "may": 58, "comforter": 59, "talk": 60, "exit": 61, "drum": 62, "across": 63, "slot": 64, "human": 65, "cord": 66, "mats": 67, "elevators": 68, "o'clock": 69, "chalk": 70, "doorknob": 71, "begins": 72, "recreation": 73, "marble-topped": 74, "dividing": 75, "backgammon": 76, "pallet": 77, "pavement": 78, "tub": 79, "zebras": 80, "completely": 81, "orchids": 82, "sharp": 83, "abstract": 84, "upon": 85, "sauna": 86, "arc": 87, "color": 88, "one": 89, "candy": 90, "finally": 91, "person": 92, "text": 93, "signs": 94, "bag": 95, "wal": 96, "curved": 97, "stained": 98, "pieces": 99, "stool": 100, "pointed": 101, "opening": 102, "prints": 103, "indoors": 104, "turn": 105, "indoor": 106, "gated": 107, "pail": 108, "straight": 109, "prior": 110, "suite": 111, "fiction": 112, "drop": 113, "the": 114, "ocean": 115, "upstairs": 116, "tubes": 117, "concrete": 118, "items": 119, "sailboats": 120, "paving": 121, "remaining": 122, "eventually": 123, "below": 124, "workout": 125, "your": 126, "shelves": 127, "sofas": 128, "different": 129, "got": 130, "orange": 131, "n't": 132, "beds": 133, "keyboard": 134, "branch": 135, "petition": 136, "grain": 137, "pair": 138, "silver": 139, "multiple": 140, "brown": 141, "foyer": 142, "tiling": 143, "trunk": 144, "stanchions": 145, "driveway": 146, "brick": 147, "four": 148, "make": 149, "stiars": 150, "314": 151, "boiler": 152, "to": 153, "each": 154, "an": 155, "for": 156, "fruit": 157, "throw": 158, "bottom": 159, "baluster": 160, "160": 161, "upholstered": 162, "infront": 163, "home": 164, "arrive": 165, "6th": 166, "gate": 167, "table": 168, "split": 169, "exits": 170, "topped": 171, "plastic": 172, "stools/chairs": 173, "on": 174, "hear": 175, "heads": 176, "dryer": 177, "wrought": 178, "nearby": 179, "attic": 180, "walk": 181, "going": 182, "slight": 183, "magazine": 184, "bedroom": 185, "easle": 186, "180": 187, "extremely": 188, "golden": 189, "altar": 190, "adjacent": 191, "stairway": 192, "as": 193, "potted": 194, "curtains": 195, "poster": 196, "shag": 197, "part": 198, "cigar": 199, "barred": 200, "only": 201, "jar": 202, "guest": 203, "wand": 204, "cushions": 205, "length": 206, "third": 207, "lake": 208, ")": 209, "planter": 210, "out": 211, "darker": 212, "stuffed": 213, "basketball": 214, "pots": 215, "gallery": 216, "coach": 217, "clothes": 218, "neighboring": 219, "artwork": 220, "down": 221, "any": 222, "flag": 223, "entrance": 224, "''": 225, "ca": 
226, "rectangular": 227, "air": 228, "houseplant": 229, "cars": 230, "go": 231, "sailboat": 232, "structure": 233, "degree": 234, "359": 235, "hard": 236, "words": 237, "shallow": 238, "paining": 239, "entertainment": 240, "appears": 241, "teal": 242, "floor": 243, "dispenser": 244, "drumset": 245, "s-c": 246, "shoes": 247, "slab": 248, "bedrooms": 249, "chest": 250, "lime": 251, "kids": 252, "fork": 253, "front": 254, "marilyn": 255, "exited": 256, "blanket": 257, "outdoor": 258, "towel": 259, "shrub": 260, "bay": 261, "barstools": 262, "statues": 263, "barroom": 264, "case": 265, "cases": 266, "dividers": 267, "immediate": 268, "mosaic": 269, "perfume": 270, "beyond": 271, "loveseat": 272, "made": 273, "bushes": 274, "facing": 275, "upwards": 276, "meters": 277, "jam": 278, "lower": 279, "backside": 280, "lounger": 281, "bookcases": 282, "stools": 283, "copier": 284, "waterfall": 285, "new": 286, "fourth": 287, "little": 288, "shrubbery": 289, "bookshelf": 290, "sides": 291, "half": 292, "gravel": 293, "thru": 294, "grandfather": 295, "media": 296, "detector": 297, "lights": 298, "decorated": 299, "2nd": 300, "log": 301, "mantel": 302, "turning": 303, "bbq": 304, "cabinet": 305, "placed": 306, "shelving": 307, "edge": 308, "plants": 309, ".": 310, "alcove": 311, "equipment": 312, "gone": 313, "screened": 314, "walk-in": 315, "bags": 316, "stove": 317, "fish": 318, "marble": 319, "hamper": 320, "again": 321, "placemat": 322, "almost": 323, "checkerboard": 324, "net": 325, "extra": 326, "have": 327, "garage": 328, "not": 329, "position": 330, "piece": 331, "pgi": 332, "ladder": 333, "niche": 334, "dead": 335, "were": 336, "heading": 337, "atrium": 338, "just": 339, "shaped": 340, "ignore": 341, "victorian": 342, "double-doors": 343, "ahead": 344, "passing": 345, "sculptures": 346, "arcade": 347, "inside": 348, "they": 349, "move": 350, "partition": 351, "plug": 352, "faucets": 353, "cushioned": 354, "flight": 355, "bird": 356, "stops": 357, "shelf": 358, "rung": 359, "splits": 360, "rod": 361, "open": 362, "if": 363, "bin": 364, "twelve": 365, ".go": 366, "tan": 367, "breezeway": 368, "drip": 369, "empty": 370, "lamps": 371, "please": 372, "mirror": 373, "photographs": 374, "reddish": 375, "turns": 376, "mid": 377, "hand": 378, "grill": 379, "few": 380, "glasses": 381, "organ": 382, "chars": 383, "ninety": 384, "marked": 385, "encounter": 386, "spot": 387, "dishes": 388, "couple": 389, "reenter": 390, "support": 391, "base": 392, "vending": 393, "stones": 394, "handicap": 395, "central": 396, "barn": 397, "continute": 398, "series": 399, "bananas": 400, "flanked": 401, "nerolie": 402, "ottomans": 403, "weird": 404, "pas": 405, "desks": 406, "stopping": 407, "narrow": 408, "entryways": 409, "mini": 410, "fixture": 411, "leaving": 412, "couch": 413, "work": 414, "above": 415, "leather": 416, "seat": 417, "bars": 418, "image": 419, "tanning": 420, "twelfth": 421, "bathtub": 422, "rock": 423, "heater": 424, "reaching": 425, "fireplace": 426, "countertop": 427, "banisters": 428, "music": 429, "landing": 430, "'": 431, "draped": 432, "ottoman": 433, "round": 434, "unit": 435, "house": 436, "rope": 437, "last": 438, "left-hand": 439, "racks": 440, "02": 441, "chandeliers": 442, "steel": 443, "urns": 444, "bottles": 445, "teddy": 446, "find": 447, "brass": 448, "well": 449, "hundred": 450, "staff": 451, "winding": 452, "bare": 453, "reading": 454, "green": 455, "posters": 456, "drawers": 457, "leave": 458, "motion": 459, "target": 460, "foot": 461, "pillow": 462, "basket": 463, "these": 464, "5": 
465, "most": 466, "wicker": 467, "carpeted": 468, "lighted": 469, "continue": 470, "mantle": 471, "outdoors": 472, "pole": 473, "even": 474, "t": 475, "thermostat": 476, "full": 477, "trapdoor": 478, "letters": 479, "breakfast": 480, "mirrored": 481, "pathway": 482, "main": 483, "located": 484, "do": 485, "globe": 486, "foward": 487, "projector": 488, "nook": 489, "flower": 490, "people": 491, "spiral": 492, "immediately": 493, "similar": 494, "weight": 495, "treadmill": 496, "many": 497, "digital": 498, "veranda": 499, "fountain": 500, "cinema": 501, "gold": 502, "scenes": 503, "blue": 504, "hit": 505, "locker": 506, "instead": 507, "blinds": 508, "garden": 509, "th": 510, "mountain": 511, "drinking": 512, "beige": 513, "hangings": 514, "ways": 515, "sitting": 516, "space": 517, "lobby": 518, "bouquet": 519, "reception": 520, "degress": 521, "stool/chairs": 522, "easel": 523, "dresser": 524, "tot": 525, "division": 526, "bicycle": 527, "stacked": 528, "20": 529, "promptly": 530, "game": 531, "butler": 532, "rolls": 533, "aluminum": 534, "following": 535, "lips": 536, "pot": 537, ",": 538, "about": 539, "unfinished": 540, "enter": 541, "piano": 542, "banquet": 543, "radiator": 544, "chair": 545, "printed": 546, "tile": 547, "filled": 548, "design": 549, "'s": 550, "china": 551, "bathroom": 552, "4th": 553, "level": 554, "candle": 555, "wine": 556, "exterior": 557, "view": 558, "rightside": 559, "longer": 560, "fan": 561, "low": 562, "alter": 563, "rugs": 564, "meter": 565, "family": 566, "tubs": 567, "lampshade": 568, "residence": 569, "so": 570, "striaght": 571, "grated": 572, "navigate": 573, "hammock": 574, "tables": 575, "bathrooms": 576, "through": 577, "washroom": 578, "patchwork": 579, "stairwell": 580, "11th": 581, "display": 582, "wat": 583, "rows": 584, "divider": 585, "now": 586, "under": 587, "rack": 588, "master": 589, "same": 590, "hexagon": 591, "o": 592, "road": 593, "becomes": 594, "circular": 595, "perpendicular": 596, "tray": 597, "hug": 598, "ship": 599, "area": 600, "garbage": 601, "hallways": 602, "bast": 603, "arches": 604, "restrooms": 605, "rights": 606, "trash": 607, "screen": 608, "bunch": 609, "guitars": 610, "drinks": 611, "clear": 612, "cellar": 613, "pattern": 614, "tablecloth": 615, "approach": 616, "entry": 617, "veering": 618, "recliner": 619, "lefts": 620, "traveling": 621, "hall": 622, "nice": 623, "body": 624, "instruments": 625, "bit": 626, "common": 627, "desk": 628, "tanks": 629, "triptych": 630, "building": 631, "handle": 632, "final": 633, "antelope": 634, "doormat": 635, "old": 636, "he": 637, "extreme": 638, "fur": 639, "heating": 640, "by": 641, "security": 642, "overhead": 643, "which": 644, "passed": 645, "leaf": 646, "1": 647, "cushion": 648, "beach": 649, "keys": 650, "post": 651, "path": 652, "den": 653, "bed": 654, "staying": 655, "vaulted": 656, "outlet": 657, "making": 658, "s": 659, "floors": 660, "tree": 661, "rocks": 662, "when": 663, "st": 664, "doorway": 665, "available": 666, "reached": 667, "meeting": 668, "bathtubs": 669, "urinals": 670, "line": 671, "chandelier": 672, "son": 673, "monitor": 674, "via": 675, "triangle": 676, "revolving": 677, "direction": 678, "dirt": 679, "downwards": 680, "pillar": 681, "glass": 682, "ends": 683, "enterance": 684, "painted": 685, "diagonally": 686, "fabric": 687, "railing": 688, "dots": 689, "prayer": 690, "overlooking": 691, "guitarist": 692, "rooms": 693, "than": 694, "walkthrough": 695, "upper": 696, "re-enter": 697, "animals": 698, "hang": 699, "passageway": 700, "fixtures": 701, "door": 
702, "river": 703, "possible": 704, "sidewalk": 705, "dressing": 706, "kitchenette": 707, "smoke": 708, "bottle": 709, "keeping": 710, "stalls": 711, "strait": 712, "chessboard": 713, "island": 714, "swinging": 715, "twenty": 716, "angle": 717, "screens": 718, "theater": 719, "towels": 720, "plant": 721, "object": 722, "contains": 723, "lattice": 724, "there": 725, "other": 726, "velvet": 727, "bookshelves": 728, "counters": 729, "windows": 730, "office": 731, "sofa": 732, "man": 733, "love": 734, "leftmost": 735, "wrap": 736, "rotate": 737, "cylindrical": 738, "arrived": 739, "chairs": 740, "palm": 741, ";": 742, "follow": 743, "this": 744, "thin": 745, "say": 746, "whiteboard": 747, "rather": 748, "ping-pong": 749, "food": 750, "entire": 751, "should": 752, "while": 753, "bridge": 754, "holes": 755, "machine": 756, "enerskin": 757, "goes": 758, "built": 759, "stone": 760, "rotary": 761, "books": 762, "burgundy": 763, "powder": 764, "or": 765, "horse": 766, "animal": 767, "settee": 768, "frame": 769, "light": 770, "snail": 771, "whose": 772, "armchairs": 773, "flowers": 774, "nine": 775, "need": 776, "lots": 777, "flowered": 778, "handles": 779, "able": 780, "climb": 781, "deck": 782, "kitchen": 783, "finished": 784, "sized": 785, "quickly": 786, "bath": 787, "wiat": 788, "fire": 789, "tops": 790, "awning": 791, "stepping": 792, "squared": 793, "double-doored": 794, "more": 795, "break": 796, "group": 797, "chaise": 798, "crosses": 799, "throuhg": 800, "trieste": 801, "style": 802, "eight": 803, "344": 804, "feather": 805, "ropes": 806, "meet": 807, "water": 808, "corners": 809, "three": 810, "lines": 811, "curve": 812, "hollow": 813, "middle": 814, "sing": 815, "up": 816, "candelabra": 817, "apples": 818, "before": 819, "everything": 820, "paneled": 821, "tusks": 822, "windowed": 823, "fireplaces": 824, "gym": 825, "climbing": 826, "lamp": 827, "wide": 828, "siting": 829, "want": 830, "arrow": 831, "l-shaped": 832, "in": 833, "bush": 834, "phone": 835, "done": 836, "skinny": 837, "beam": 838, "tall": 839, "unicycle": 840, "travel": 841, "columns": 842, "urn": 843, "saloon": 844, "would": 845, "remainder": 846, "girl": 847, "rustic": 848, "seating": 849, "beside": 850, "theatre": 851, "halls": 852, "buffet": 853, "board": 854, "six": 855, "45": 856, "canisters": 857, "ascending": 858, "mirrors": 859, "pantry": 860, "though": 861, "feet": 862, "destination": 863, "metal": 864, "makeup": 865, "welcome": 866, "rail": 867, "several": 868, "jacuzzi": 869, "cupboards": 870, "big": 871, "descend": 872, "slightly": 873, "leads": 874, "pool": 875, "stick": 876, "panel": 877, "pong": 878, "station": 879, "vanity": 880, "inset": 881, "stationary": 882, "approximately": 883, "either": 884, "pulp": 885, "massive": 886, "'ll": 887, "build": 888, "very": 889, "baskets": 890, "graphic": 891, "statute": 892, "bigger": 893, "smaller": 894, "fern": 895, "use": 896, "indoor/outdoor": 897, "sets": 898, "face": 899, "art": 900, "foosball": 901, "vine": 902, "hockey": 903, "lighting": 904, "television": 905, "lead": 906, "street": 907, "liquor": 908, "word": 909, "oval": 910, "pillows": 911, "curtain": 912, "barbeque": 913, "drawing": 914, "exiting": 915, "backyard": 916, "locate": 917, "past": 918, "wooden": 919, "instructions": 920, "stepped": 921, "plaque": 922, "court": 923, "dinner": 924, "stall": 925, "ground": 926, "thorough": 927, "hands": 928, "refrigerator": 929, "headed": 930, "mat": 931, "cane": 932, "cooler": 933, "lady": 934, "takes": 935, "set": 936, "closets": 937, "like": 938, "handicapped": 
939, "pews": 940, "woman": 941, "wait": 942, "saxophone": 943, "beauty": 944, "stars": 945, "scene": 946, "walking": 947, "wheelchair": 948, "dinging": 949, "froward": 950, "followed": 951, "come": 952, "shape": 953, "deer": 954, "lounging": 955, "drawings": 956, "behind": 957, "exercise": 958, "forwards": 959, "sconce": 960, "stand": 961, "leading": 962, "panes": 963, "pivot": 964, "does": 965, "armoire": 966, "farm": 967, "railings": 968, "went": 969, "french": 970, "proceed": 971, "coat": 972, "larger": 973, "rice": 974, "plate": 975, "bike": 976, "treadmills": 977, "ornate": 978, "beams": 979, "looks": 980, "int": 981, "loungers": 982, "striped": 983, "tiles": 984, "motorcycle": 985, "chimney": 986, "thought": 987, "hot": 988, "towards": 989, "closest": 990, "tennis": 991, "arched": 992, "gray": 993, "moment": 994, "closet/bathroom": 995, "basement": 996, "hutch": 997, "seats": 998, "photo": 999, "cover": 1000, "closet": 1001, "bench": 1002, "areas": 1003, "from": 1004, "minibar": 1005, "sculpture": 1006, "onto": 1007, "slider": 1008, "drawer": 1009, "sliding": 1010, "chevron": 1011, "checkered": 1012, "doorframe": 1013, "thirty": 1014, "nest": 1015, "labeled": 1016, "rightmost": 1017, "bear": 1018, "host": 1019, "showers": 1020, "flooring": 1021, "dual": 1022, "getting": 1023, "ping": 1024, "single": 1025, "starting": 1026, "finish": 1027, "soon": 1028, "throught": 1029, "wooded": 1030, "buildings": 1031, "sunset": 1032, "wall-mounted": 1033, "stop": 1034, "immediatly": 1035, "walls": 1036, "padded": 1037, "movie": 1038, "print": 1039, "ceramic": 1040, "corridor": 1041, "granite": 1042, "runner": 1043, "tight": 1044, "einstein": 1045, "waiting": 1046, "entranceway": 1047, "some": 1048, "restaurant": 1049, "roof": 1050, "another": 1051, "red": 1052, "popcorn": 1053, "paper": 1054, "mermaid": 1055, "eighty": 1056, "holding": 1057, "flat": 1058, "ones": 1059, "armchair": 1060, "paneling": 1061, "counting": 1062, "them": 1063, "is": 1064, "crossing": 1065, "computer": 1066, "frosted": 1067, "paces": 1068, "boy": 1069, "labelled": 1070, "hyena": 1071, "further": 1072, "ached": 1073, "continuing": 1074, "'ve": 1075, "two": 1076, "native": 1077, "rafter": 1078, "living": 1079, "barbecue": 1080, "change": 1081, "around": 1082, "sheets": 1083, "has": 1084, "elevator": 1085, "appliances": 1086, "swimming": 1087, "ad": 1088, "oven": 1089, "library": 1090, "its": 1091, "distance": 1092, "openings": 1093, "chair/stools": 1094, "peices": 1095, "long": 1096, "chrome": 1097, "solid": 1098, "window": 1099, "wardrobe": 1100, "step": 1101, "breaker": 1102, "rectangle": 1103, "rest": 1104, "halt": 1105, "end": 1106, "threshold": 1107, "closed": 1108, "taking": 1109, "panels": 1110, "run": 1111, "thing": 1112, "pictures": 1113, "pink": 1114, "construction": 1115, "veer": 1116, "nightstand": 1117, "walks": 1118, "closer": 1119, "pipes": 1120, "dot": 1121, "banister": 1122, "framed": 1123, "-": 1124, "women": 1125, "walkway": 1126, "high": 1127, "staircase": 1128, "furniture": 1129, "attached": 1130, "cream": 1131, "grate": 1132, "moving": 1133, "cement": 1134, "enclosure": 1135, "mini-fridge": 1136, "decorative": 1137, "plaques": 1138, "cylinder": 1139, "landscape": 1140, "downtown": 1141, "peice": 1142, "bistro": 1143, "diamond": 1144, "pipe": 1145, "underneath": 1146, "aqua": 1147, "barrel": 1148, "you": 1149, "furthest": 1150, "opp": 1151, "purple": 1152, "looking": 1153, "beautiful": 1154, "wires": 1155, "then": 1156, "bowl": 1157, "tapestry": 1158, "large": 1159, "off": 1160, "daybed": 1161, "lounge": 
1162, "camera": 1163, "interior": 1164, "scale": 1165, "'re": 1166, "materials": 1167, "being": 1168, "folding": 1169, "bookcase": 1170, "11": 1171, "playroom": 1172, "along": 1173, "over": 1174, "beneath": 1175, "hose": 1176, "iron": 1177, "are": 1178, "wastebasket": 1179, "e": 1180, "cacti": 1181, "guardrail": 1182, "terrace": 1183, "pendant": 1184, "centerpiece": 1185, "clock": 1186, "archway": 1187, "a": 1188, "after": 1189, "gilt": 1190, "right": 1191, "says": 1192, "surrounding": 1193, "reclining": 1194, "point": 1195, "bannister": 1196, "collection": 1197, "shoe": 1198, "chaises": 1199, "curtained": 1200, "colorful": 1201, "spaces": 1202, "guitar": 1203, "portrait": 1204, "balcony": 1205, "entering": 1206, "339": 1207, "fence": 1208, "meets": 1209, "containing": 1210, "ascend": 1211, "kitchen/living": 1212, "monroe": 1213, "vases": 1214, "barbers": 1215, "extinguisher": 1216, "polka": 1217, "credenza": 1218, "ladies": 1219, "sink": 1220, "circles": 1221, "both": 1222, "candles": 1223, "left": 1224, "benches": 1225, "our": 1226, "horizontal": 1227, "elephant": 1228, "toward": 1229, "written": 1230, "planters": 1231, "mural": 1232, "men": 1233, "formal": 1234, "bar": 1235, "current": 1236, "sword": 1237, "started": 1238, "whole": 1239, "3rd": 1240, "giraffe": 1241, "walled": 1242, "cathedral": 1243, "those": 1244, "double": 1245, "cow": 1246, "changes": 1247, "wallpaper": 1248, "take": 1249, "til": 1250, "cheetah": 1251, "column": 1252, "section": 1253, "ledge": 1254, "fridge": 1255, "bust": 1256, "maroon": 1257, "downstairs": 1258, "homes": 1259, "foreward": 1260, "statue": 1261, "where": 1262, "fifth": 1263, "urinal": 1264, "switch": 1265, "salon": 1266, "begin": 1267, "pointing": 1268, "with": 1269, "bridal": 1270, "showing": 1271, "toilet": 1272, "crab": 1273, "shop": 1274, "tiny": 1275, "pass": 1276, "lit": 1277, "things": 1278, "serving": 1279, "tea": 1280, "skin": 1281, "boards": 1282, "next": 1283, "oil": 1284, "herringbone": 1285, "plates": 1286, "small": 1287, "wing": 1288, "printer": 1289, "boat": 1290, "entered": 1291, "seven": 1292, "here": 1293, "once": 1294, "start": 1295, "orient": 1296, "carpet": 1297, "enclosed": 1298, "pocket": 1299, "dome": 1300, "will": 1301, "pew": 1302, "no": 1303, "zebra": 1304, "canvas": 1305, "runs": 1306, "having": 1307, "platform": 1308, "raised": 1309, "all": 1310, "loft": 1311, "stoves": 1312, "shower": 1313, "greyish": 1314, "star": 1315, "cardboard": 1316, "table-tennis": 1317, "stay": 1318, "laundry": 1319, "``": 1320, "impact": 1321, "store": 1322, "char": 1323, "mounted": 1324, "check": 1325, "geometric": 1326, "tank": 1327, "bedside": 1328, "staircases": 1329, "degrees": 1330, "electrical": 1331, "look": 1332, "head": 1333, "row": 1334, "frames": 1335, "buddha": 1336, "archways": 1337, "roped": 1338, "square": 1339, "futon": 1340, "en": 1341, "dogleg": 1342, "box": 1343, "decoration": 1344, "floored": 1345, "#": 1346, "fake": 1347, "black": 1348, "metre": 1349, "opportunity": 1350, "goright": 1351, "rocking": 1352, "metallic": 1353, "massage": 1354, "6": 1355, "receptionist": 1356, "of": 1357, "jump": 1358, "sit": 1359, "parallel": 1360, "rug": 1361, "midway": 1362, "opens": 1363, "halfway": 1364, "fitness": 1365, "4": 1366, "let": 1367, "pause": 1368, "class": 1369, "be": 1370, "avoid": 1371, "sticking": 1372, "pig": 1373, "shuttered": 1374, "book": 1375, "forward": 1376, "that": 1377, "glass-topped": 1378, "seventh": 1379, "top": 1380, "boxes": 1381, "great": 1382, "hanging": 1383, "ventilation": 1384, "number": 1385, 
"conference": 1386, "utility": 1387, "sign": 1388, "tun": 1389, "pane": 1390, "background": 1391, "block": 1392, "telephone": 1393, "plywood": 1394, "curio": 1395, "machines": 1396, "gazebo": 1397, "workbench": 1398, "quarter": 1399, "exist": 1400, "advance": 1401, "microwave": 1402, "bathing": 1403, "cabinets": 1404, "fancy": 1405, "walked": 1406, "pillars": 1407, "separating": 1408, "workroom": 1409, "setting": 1410, "ballroom": 1411, "turk": 1412, "odd-shaped": 1413, "grey": 1414, "grand": 1415, "stairways": 1416, "ten": 1417, "oriental": 1418, "drums": 1419, "get": 1420, "away": 1421, "outside": 1422, "billiards": 1423, "decorations": 1424, "cross": 1425, "american": 1426, "hub": 1427, "opened": 1428, "entrances": 1429, "l": 1430, "cactus": 1431, "wet": 1432, "lockers": 1433, "church": 1434, "doorstep": 1435, "intersecting": 1436, "stage": 1437, "vertical": 1438, "diagonal": 1439, "woven": 1440, "video": 1441, "ski": 1442, "center": 1443, "arch": 1444, "sharply": 1445, "at": 1446, "vertically": 1447, "cloth": 1448, "anymore": 1449, "fountains": 1450, "porch": 1451, "rounded": 1452, "birdcage": 1453, "first": 1454, "carpeting": 1455, "minifridge": 1456, "vase": 1457, "goodfellas": 1458, "plank": 1459, "dark": 1460, "vent": 1461, "storage": 1462, "antechamber": 1463, "forty-five": 1464, "instruction": 1465, "dressers": 1466, "aside": 1467, "tv": 1468, "alongside": 1469, "boulders": 1470, "into": 1471, "descending": 1472, "barber": 1473, "posts": 1474, "wreath": 1475, "cowhide": 1476, "direclty": 1477, "curving": 1478, "projection": 1479, "dinning": 1480, "u": 1481, "second": 1482, "farthest": 1483, "twin": 1484, "trees": 1485, "passage": 1486, "keep": 1487, "skylight": 1488, "covered": 1489, "close": 1490, "lion": 1491, "steps": 1492, "grass": 1493, "doors": 1494, "put": 1495, "spa": 1496, "wood": 1497, "arrangement": 1498, "sectional": 1499, "stair": 1500, "adjoining": 1501, "hat": 1502, "faucet": 1503, "handing": 1504, "room": 1505, "corner": 1506, "wheel": 1507, "till": 1508, "swing": 1509, "lawn": 1510, "extinguishers": 1511, "side": 1512, "colored": 1513, "between": 1514, "marbled": 1515, "changing": 1516, "curves": 1517, "pedestal": 1518, "ball": 1519, "hats": 1520, "read": 1521, "identical": 1522, "ceiling": 1523, "quick": 1524, "really": 1525, "paintings": 1526, "supplies": 1527, "original": 1528, "washer/dryer": 1529, "plan": 1530, "dining": 1531, "washer": 1532, "aisle": 1533, "sun": 1534, "semi": 1535, "far": 1536, "footstool": 1537, "goal": 1538, "patio": 1539, "way": 1540, "reverse": 1541, "directly": 1542, "wooden/glass": 1543, "too": 1544, "until": 1545, "sunken": 1546, "painting": 1547, "coffee": 1548, "see": 1549, "intersects": 1550, "picture": 1551, "their": 1552, "without": 1553, "huge": 1554, "restroom": 1555, "nearest": 1556, "night": 1557, "straw": 1558, "children": 1559, "flowerbed": 1560, "drawn": 1561, "strip": 1562, "yard": 1563, "paved": 1564, "and": 1565, "using": 1566, "u-turn": 1567, "await": 1568, "downward": 1569, "tiled": 1570, "(": 1571, "crystal": 1572, "carts": 1573, "built-in": 1574, "dinette": 1575, "but": 1576, "medical": 1577, "clockwise": 1578, "control": 1579, "enters": 1580, "back": 1581, "dog": 1582, "90": 1583, "arms": 1584, "also": 1585, "hedges": 1586, "chairs/stools": 1587, "against": 1588, "opposite": 1589, "washing": 1590, "transparent": 1591, "blackboard": 1592, "2": 1593, "study": 1594, "sinks": 1595, "passes": 1596, "white": 1597, "short": 1598, "five": 1599, "place": 1600, "push": 1601, "barrier": 1602, "plaid": 1603, "stainless": 
1604, "wash": 1605, "surrounded": 1606, "slats": 1607, "beginning": 1608, "stairs": 1609, "couches": 1610, "dollhouse": 1611, "recessed": 1612, "near": 1613, "recliners": 1614, "hedge": 1615, "material": 1616, "faced": 1617, "sop": 1618, "elliptical": 1619, "leopard": 1620, "car": 1621, "pine": 1622, "robes": 1623, "teh": 1624}, "idx2word": {"0": "", "1": "", "2": "", "3": "", "4": "", "5": "?", "6": "circle", "7": "bins", "8": "console", "9": "standing", "10": "radio", "11": "rails", "12": "arriving", "13": "seeing", "14": "yellow", "15": "hallway", "16": "trashcan", "17": "bulletin", "18": "rear", "19": "giant", "20": "could", "21": "yourself", "22": "billiard", "23": "portraits", "24": "watermelon", "25": "upward", "26": "patterned", "27": "christmas", "28": "sixth", "29": "dishwasher", "30": "arm", "31": "3", "32": "floral", "33": "ovens", "34": "copy", "35": "it", "36": "visible", "37": "doorways", "38": "switches", "39": "reach", "40": "crate", "41": "photos", "42": "vessel", "43": "counter", "44": "player", "45": "remain", "46": "handrail", "47": "hardwood", "48": "intersection", "49": "bamboo", "50": "flights", "51": "lined", "52": "can", "53": "business", "54": "wall", "55": "entryway", "56": "still", "57": "jog", "58": "may", "59": "comforter", "60": "talk", "61": "exit", "62": "drum", "63": "across", "64": "slot", "65": "human", "66": "cord", "67": "mats", "68": "elevators", "69": "o'clock", "70": "chalk", "71": "doorknob", "72": "begins", "73": "recreation", "74": "marble-topped", "75": "dividing", "76": "backgammon", "77": "pallet", "78": "pavement", "79": "tub", "80": "zebras", "81": "completely", "82": "orchids", "83": "sharp", "84": "abstract", "85": "upon", "86": "sauna", "87": "arc", "88": "color", "89": "one", "90": "candy", "91": "finally", "92": "person", "93": "text", "94": "signs", "95": "bag", "96": "wal", "97": "curved", "98": "stained", "99": "pieces", "100": "stool", "101": "pointed", "102": "opening", "103": "prints", "104": "indoors", "105": "turn", "106": "indoor", "107": "gated", "108": "pail", "109": "straight", "110": "prior", "111": "suite", "112": "fiction", "113": "drop", "114": "the", "115": "ocean", "116": "upstairs", "117": "tubes", "118": "concrete", "119": "items", "120": "sailboats", "121": "paving", "122": "remaining", "123": "eventually", "124": "below", "125": "workout", "126": "your", "127": "shelves", "128": "sofas", "129": "different", "130": "got", "131": "orange", "132": "n't", "133": "beds", "134": "keyboard", "135": "branch", "136": "petition", "137": "grain", "138": "pair", "139": "silver", "140": "multiple", "141": "brown", "142": "foyer", "143": "tiling", "144": "trunk", "145": "stanchions", "146": "driveway", "147": "brick", "148": "four", "149": "make", "150": "stiars", "151": "314", "152": "boiler", "153": "to", "154": "each", "155": "an", "156": "for", "157": "fruit", "158": "throw", "159": "bottom", "160": "baluster", "161": "160", "162": "upholstered", "163": "infront", "164": "home", "165": "arrive", "166": "6th", "167": "gate", "168": "table", "169": "split", "170": "exits", "171": "topped", "172": "plastic", "173": "stools/chairs", "174": "on", "175": "hear", "176": "heads", "177": "dryer", "178": "wrought", "179": "nearby", "180": "attic", "181": "walk", "182": "going", "183": "slight", "184": "magazine", "185": "bedroom", "186": "easle", "187": "180", "188": "extremely", "189": "golden", "190": "altar", "191": "adjacent", "192": "stairway", "193": "as", "194": "potted", "195": "curtains", "196": "poster", "197": "shag", 
"198": "part", "199": "cigar", "200": "barred", "201": "only", "202": "jar", "203": "guest", "204": "wand", "205": "cushions", "206": "length", "207": "third", "208": "lake", "209": ")", "210": "planter", "211": "out", "212": "darker", "213": "stuffed", "214": "basketball", "215": "pots", "216": "gallery", "217": "coach", "218": "clothes", "219": "neighboring", "220": "artwork", "221": "down", "222": "any", "223": "flag", "224": "entrance", "225": "''", "226": "ca", "227": "rectangular", "228": "air", "229": "houseplant", "230": "cars", "231": "go", "232": "sailboat", "233": "structure", "234": "degree", "235": "359", "236": "hard", "237": "words", "238": "shallow", "239": "paining", "240": "entertainment", "241": "appears", "242": "teal", "243": "floor", "244": "dispenser", "245": "drumset", "246": "s-c", "247": "shoes", "248": "slab", "249": "bedrooms", "250": "chest", "251": "lime", "252": "kids", "253": "fork", "254": "front", "255": "marilyn", "256": "exited", "257": "blanket", "258": "outdoor", "259": "towel", "260": "shrub", "261": "bay", "262": "barstools", "263": "statues", "264": "barroom", "265": "case", "266": "cases", "267": "dividers", "268": "immediate", "269": "mosaic", "270": "perfume", "271": "beyond", "272": "loveseat", "273": "made", "274": "bushes", "275": "facing", "276": "upwards", "277": "meters", "278": "jam", "279": "lower", "280": "backside", "281": "lounger", "282": "bookcases", "283": "stools", "284": "copier", "285": "waterfall", "286": "new", "287": "fourth", "288": "little", "289": "shrubbery", "290": "bookshelf", "291": "sides", "292": "half", "293": "gravel", "294": "thru", "295": "grandfather", "296": "media", "297": "detector", "298": "lights", "299": "decorated", "300": "2nd", "301": "log", "302": "mantel", "303": "turning", "304": "bbq", "305": "cabinet", "306": "placed", "307": "shelving", "308": "edge", "309": "plants", "310": ".", "311": "alcove", "312": "equipment", "313": "gone", "314": "screened", "315": "walk-in", "316": "bags", "317": "stove", "318": "fish", "319": "marble", "320": "hamper", "321": "again", "322": "placemat", "323": "almost", "324": "checkerboard", "325": "net", "326": "extra", "327": "have", "328": "garage", "329": "not", "330": "position", "331": "piece", "332": "pgi", "333": "ladder", "334": "niche", "335": "dead", "336": "were", "337": "heading", "338": "atrium", "339": "just", "340": "shaped", "341": "ignore", "342": "victorian", "343": "double-doors", "344": "ahead", "345": "passing", "346": "sculptures", "347": "arcade", "348": "inside", "349": "they", "350": "move", "351": "partition", "352": "plug", "353": "faucets", "354": "cushioned", "355": "flight", "356": "bird", "357": "stops", "358": "shelf", "359": "rung", "360": "splits", "361": "rod", "362": "open", "363": "if", "364": "bin", "365": "twelve", "366": ".go", "367": "tan", "368": "breezeway", "369": "drip", "370": "empty", "371": "lamps", "372": "please", "373": "mirror", "374": "photographs", "375": "reddish", "376": "turns", "377": "mid", "378": "hand", "379": "grill", "380": "few", "381": "glasses", "382": "organ", "383": "chars", "384": "ninety", "385": "marked", "386": "encounter", "387": "spot", "388": "dishes", "389": "couple", "390": "reenter", "391": "support", "392": "base", "393": "vending", "394": "stones", "395": "handicap", "396": "central", "397": "barn", "398": "continute", "399": "series", "400": "bananas", "401": "flanked", "402": "nerolie", "403": "ottomans", "404": "weird", "405": "pas", "406": "desks", "407": "stopping", "408": "narrow", 
"409": "entryways", "410": "mini", "411": "fixture", "412": "leaving", "413": "couch", "414": "work", "415": "above", "416": "leather", "417": "seat", "418": "bars", "419": "image", "420": "tanning", "421": "twelfth", "422": "bathtub", "423": "rock", "424": "heater", "425": "reaching", "426": "fireplace", "427": "countertop", "428": "banisters", "429": "music", "430": "landing", "431": "'", "432": "draped", "433": "ottoman", "434": "round", "435": "unit", "436": "house", "437": "rope", "438": "last", "439": "left-hand", "440": "racks", "441": "02", "442": "chandeliers", "443": "steel", "444": "urns", "445": "bottles", "446": "teddy", "447": "find", "448": "brass", "449": "well", "450": "hundred", "451": "staff", "452": "winding", "453": "bare", "454": "reading", "455": "green", "456": "posters", "457": "drawers", "458": "leave", "459": "motion", "460": "target", "461": "foot", "462": "pillow", "463": "basket", "464": "these", "465": "5", "466": "most", "467": "wicker", "468": "carpeted", "469": "lighted", "470": "continue", "471": "mantle", "472": "outdoors", "473": "pole", "474": "even", "475": "t", "476": "thermostat", "477": "full", "478": "trapdoor", "479": "letters", "480": "breakfast", "481": "mirrored", "482": "pathway", "483": "main", "484": "located", "485": "do", "486": "globe", "487": "foward", "488": "projector", "489": "nook", "490": "flower", "491": "people", "492": "spiral", "493": "immediately", "494": "similar", "495": "weight", "496": "treadmill", "497": "many", "498": "digital", "499": "veranda", "500": "fountain", "501": "cinema", "502": "gold", "503": "scenes", "504": "blue", "505": "hit", "506": "locker", "507": "instead", "508": "blinds", "509": "garden", "510": "th", "511": "mountain", "512": "drinking", "513": "beige", "514": "hangings", "515": "ways", "516": "sitting", "517": "space", "518": "lobby", "519": "bouquet", "520": "reception", "521": "degress", "522": "stool/chairs", "523": "easel", "524": "dresser", "525": "tot", "526": "division", "527": "bicycle", "528": "stacked", "529": "20", "530": "promptly", "531": "game", "532": "butler", "533": "rolls", "534": "aluminum", "535": "following", "536": "lips", "537": "pot", "538": ",", "539": "about", "540": "unfinished", "541": "enter", "542": "piano", "543": "banquet", "544": "radiator", "545": "chair", "546": "printed", "547": "tile", "548": "filled", "549": "design", "550": "'s", "551": "china", "552": "bathroom", "553": "4th", "554": "level", "555": "candle", "556": "wine", "557": "exterior", "558": "view", "559": "rightside", "560": "longer", "561": "fan", "562": "low", "563": "alter", "564": "rugs", "565": "meter", "566": "family", "567": "tubs", "568": "lampshade", "569": "residence", "570": "so", "571": "striaght", "572": "grated", "573": "navigate", "574": "hammock", "575": "tables", "576": "bathrooms", "577": "through", "578": "washroom", "579": "patchwork", "580": "stairwell", "581": "11th", "582": "display", "583": "wat", "584": "rows", "585": "divider", "586": "now", "587": "under", "588": "rack", "589": "master", "590": "same", "591": "hexagon", "592": "o", "593": "road", "594": "becomes", "595": "circular", "596": "perpendicular", "597": "tray", "598": "hug", "599": "ship", "600": "area", "601": "garbage", "602": "hallways", "603": "bast", "604": "arches", "605": "restrooms", "606": "rights", "607": "trash", "608": "screen", "609": "bunch", "610": "guitars", "611": "drinks", "612": "clear", "613": "cellar", "614": "pattern", "615": "tablecloth", "616": "approach", "617": "entry", "618": "veering", 
"619": "recliner", "620": "lefts", "621": "traveling", "622": "hall", "623": "nice", "624": "body", "625": "instruments", "626": "bit", "627": "common", "628": "desk", "629": "tanks", "630": "triptych", "631": "building", "632": "handle", "633": "final", "634": "antelope", "635": "doormat", "636": "old", "637": "he", "638": "extreme", "639": "fur", "640": "heating", "641": "by", "642": "security", "643": "overhead", "644": "which", "645": "passed", "646": "leaf", "647": "1", "648": "cushion", "649": "beach", "650": "keys", "651": "post", "652": "path", "653": "den", "654": "bed", "655": "staying", "656": "vaulted", "657": "outlet", "658": "making", "659": "s", "660": "floors", "661": "tree", "662": "rocks", "663": "when", "664": "st", "665": "doorway", "666": "available", "667": "reached", "668": "meeting", "669": "bathtubs", "670": "urinals", "671": "line", "672": "chandelier", "673": "son", "674": "monitor", "675": "via", "676": "triangle", "677": "revolving", "678": "direction", "679": "dirt", "680": "downwards", "681": "pillar", "682": "glass", "683": "ends", "684": "enterance", "685": "painted", "686": "diagonally", "687": "fabric", "688": "railing", "689": "dots", "690": "prayer", "691": "overlooking", "692": "guitarist", "693": "rooms", "694": "than", "695": "walkthrough", "696": "upper", "697": "re-enter", "698": "animals", "699": "hang", "700": "passageway", "701": "fixtures", "702": "door", "703": "river", "704": "possible", "705": "sidewalk", "706": "dressing", "707": "kitchenette", "708": "smoke", "709": "bottle", "710": "keeping", "711": "stalls", "712": "strait", "713": "chessboard", "714": "island", "715": "swinging", "716": "twenty", "717": "angle", "718": "screens", "719": "theater", "720": "towels", "721": "plant", "722": "object", "723": "contains", "724": "lattice", "725": "there", "726": "other", "727": "velvet", "728": "bookshelves", "729": "counters", "730": "windows", "731": "office", "732": "sofa", "733": "man", "734": "love", "735": "leftmost", "736": "wrap", "737": "rotate", "738": "cylindrical", "739": "arrived", "740": "chairs", "741": "palm", "742": ";", "743": "follow", "744": "this", "745": "thin", "746": "say", "747": "whiteboard", "748": "rather", "749": "ping-pong", "750": "food", "751": "entire", "752": "should", "753": "while", "754": "bridge", "755": "holes", "756": "machine", "757": "enerskin", "758": "goes", "759": "built", "760": "stone", "761": "rotary", "762": "books", "763": "burgundy", "764": "powder", "765": "or", "766": "horse", "767": "animal", "768": "settee", "769": "frame", "770": "light", "771": "snail", "772": "whose", "773": "armchairs", "774": "flowers", "775": "nine", "776": "need", "777": "lots", "778": "flowered", "779": "handles", "780": "able", "781": "climb", "782": "deck", "783": "kitchen", "784": "finished", "785": "sized", "786": "quickly", "787": "bath", "788": "wiat", "789": "fire", "790": "tops", "791": "awning", "792": "stepping", "793": "squared", "794": "double-doored", "795": "more", "796": "break", "797": "group", "798": "chaise", "799": "crosses", "800": "throuhg", "801": "trieste", "802": "style", "803": "eight", "804": "344", "805": "feather", "806": "ropes", "807": "meet", "808": "water", "809": "corners", "810": "three", "811": "lines", "812": "curve", "813": "hollow", "814": "middle", "815": "sing", "816": "up", "817": "candelabra", "818": "apples", "819": "before", "820": "everything", "821": "paneled", "822": "tusks", "823": "windowed", "824": "fireplaces", "825": "gym", "826": "climbing", "827": "lamp", "828": 
"wide", "829": "siting", "830": "want", "831": "arrow", "832": "l-shaped", "833": "in", "834": "bush", "835": "phone", "836": "done", "837": "skinny", "838": "beam", "839": "tall", "840": "unicycle", "841": "travel", "842": "columns", "843": "urn", "844": "saloon", "845": "would", "846": "remainder", "847": "girl", "848": "rustic", "849": "seating", "850": "beside", "851": "theatre", "852": "halls", "853": "buffet", "854": "board", "855": "six", "856": "45", "857": "canisters", "858": "ascending", "859": "mirrors", "860": "pantry", "861": "though", "862": "feet", "863": "destination", "864": "metal", "865": "makeup", "866": "welcome", "867": "rail", "868": "several", "869": "jacuzzi", "870": "cupboards", "871": "big", "872": "descend", "873": "slightly", "874": "leads", "875": "pool", "876": "stick", "877": "panel", "878": "pong", "879": "station", "880": "vanity", "881": "inset", "882": "stationary", "883": "approximately", "884": "either", "885": "pulp", "886": "massive", "887": "'ll", "888": "build", "889": "very", "890": "baskets", "891": "graphic", "892": "statute", "893": "bigger", "894": "smaller", "895": "fern", "896": "use", "897": "indoor/outdoor", "898": "sets", "899": "face", "900": "art", "901": "foosball", "902": "vine", "903": "hockey", "904": "lighting", "905": "television", "906": "lead", "907": "street", "908": "liquor", "909": "word", "910": "oval", "911": "pillows", "912": "curtain", "913": "barbeque", "914": "drawing", "915": "exiting", "916": "backyard", "917": "locate", "918": "past", "919": "wooden", "920": "instructions", "921": "stepped", "922": "plaque", "923": "court", "924": "dinner", "925": "stall", "926": "ground", "927": "thorough", "928": "hands", "929": "refrigerator", "930": "headed", "931": "mat", "932": "cane", "933": "cooler", "934": "lady", "935": "takes", "936": "set", "937": "closets", "938": "like", "939": "handicapped", "940": "pews", "941": "woman", "942": "wait", "943": "saxophone", "944": "beauty", "945": "stars", "946": "scene", "947": "walking", "948": "wheelchair", "949": "dinging", "950": "froward", "951": "followed", "952": "come", "953": "shape", "954": "deer", "955": "lounging", "956": "drawings", "957": "behind", "958": "exercise", "959": "forwards", "960": "sconce", "961": "stand", "962": "leading", "963": "panes", "964": "pivot", "965": "does", "966": "armoire", "967": "farm", "968": "railings", "969": "went", "970": "french", "971": "proceed", "972": "coat", "973": "larger", "974": "rice", "975": "plate", "976": "bike", "977": "treadmills", "978": "ornate", "979": "beams", "980": "looks", "981": "int", "982": "loungers", "983": "striped", "984": "tiles", "985": "motorcycle", "986": "chimney", "987": "thought", "988": "hot", "989": "towards", "990": "closest", "991": "tennis", "992": "arched", "993": "gray", "994": "moment", "995": "closet/bathroom", "996": "basement", "997": "hutch", "998": "seats", "999": "photo", "1000": "cover", "1001": "closet", "1002": "bench", "1003": "areas", "1004": "from", "1005": "minibar", "1006": "sculpture", "1007": "onto", "1008": "slider", "1009": "drawer", "1010": "sliding", "1011": "chevron", "1012": "checkered", "1013": "doorframe", "1014": "thirty", "1015": "nest", "1016": "labeled", "1017": "rightmost", "1018": "bear", "1019": "host", "1020": "showers", "1021": "flooring", "1022": "dual", "1023": "getting", "1024": "ping", "1025": "single", "1026": "starting", "1027": "finish", "1028": "soon", "1029": "throught", "1030": "wooded", "1031": "buildings", "1032": "sunset", "1033": "wall-mounted", 
"1034": "stop", "1035": "immediatly", "1036": "walls", "1037": "padded", "1038": "movie", "1039": "print", "1040": "ceramic", "1041": "corridor", "1042": "granite", "1043": "runner", "1044": "tight", "1045": "einstein", "1046": "waiting", "1047": "entranceway", "1048": "some", "1049": "restaurant", "1050": "roof", "1051": "another", "1052": "red", "1053": "popcorn", "1054": "paper", "1055": "mermaid", "1056": "eighty", "1057": "holding", "1058": "flat", "1059": "ones", "1060": "armchair", "1061": "paneling", "1062": "counting", "1063": "them", "1064": "is", "1065": "crossing", "1066": "computer", "1067": "frosted", "1068": "paces", "1069": "boy", "1070": "labelled", "1071": "hyena", "1072": "further", "1073": "ached", "1074": "continuing", "1075": "'ve", "1076": "two", "1077": "native", "1078": "rafter", "1079": "living", "1080": "barbecue", "1081": "change", "1082": "around", "1083": "sheets", "1084": "has", "1085": "elevator", "1086": "appliances", "1087": "swimming", "1088": "ad", "1089": "oven", "1090": "library", "1091": "its", "1092": "distance", "1093": "openings", "1094": "chair/stools", "1095": "peices", "1096": "long", "1097": "chrome", "1098": "solid", "1099": "window", "1100": "wardrobe", "1101": "step", "1102": "breaker", "1103": "rectangle", "1104": "rest", "1105": "halt", "1106": "end", "1107": "threshold", "1108": "closed", "1109": "taking", "1110": "panels", "1111": "run", "1112": "thing", "1113": "pictures", "1114": "pink", "1115": "construction", "1116": "veer", "1117": "nightstand", "1118": "walks", "1119": "closer", "1120": "pipes", "1121": "dot", "1122": "banister", "1123": "framed", "1124": "-", "1125": "women", "1126": "walkway", "1127": "high", "1128": "staircase", "1129": "furniture", "1130": "attached", "1131": "cream", "1132": "grate", "1133": "moving", "1134": "cement", "1135": "enclosure", "1136": "mini-fridge", "1137": "decorative", "1138": "plaques", "1139": "cylinder", "1140": "landscape", "1141": "downtown", "1142": "peice", "1143": "bistro", "1144": "diamond", "1145": "pipe", "1146": "underneath", "1147": "aqua", "1148": "barrel", "1149": "you", "1150": "furthest", "1151": "opp", "1152": "purple", "1153": "looking", "1154": "beautiful", "1155": "wires", "1156": "then", "1157": "bowl", "1158": "tapestry", "1159": "large", "1160": "off", "1161": "daybed", "1162": "lounge", "1163": "camera", "1164": "interior", "1165": "scale", "1166": "'re", "1167": "materials", "1168": "being", "1169": "folding", "1170": "bookcase", "1171": "11", "1172": "playroom", "1173": "along", "1174": "over", "1175": "beneath", "1176": "hose", "1177": "iron", "1178": "are", "1179": "wastebasket", "1180": "e", "1181": "cacti", "1182": "guardrail", "1183": "terrace", "1184": "pendant", "1185": "centerpiece", "1186": "clock", "1187": "archway", "1188": "a", "1189": "after", "1190": "gilt", "1191": "right", "1192": "says", "1193": "surrounding", "1194": "reclining", "1195": "point", "1196": "bannister", "1197": "collection", "1198": "shoe", "1199": "chaises", "1200": "curtained", "1201": "colorful", "1202": "spaces", "1203": "guitar", "1204": "portrait", "1205": "balcony", "1206": "entering", "1207": "339", "1208": "fence", "1209": "meets", "1210": "containing", "1211": "ascend", "1212": "kitchen/living", "1213": "monroe", "1214": "vases", "1215": "barbers", "1216": "extinguisher", "1217": "polka", "1218": "credenza", "1219": "ladies", "1220": "sink", "1221": "circles", "1222": "both", "1223": "candles", "1224": "left", "1225": "benches", "1226": "our", "1227": "horizontal", "1228": 
"elephant", "1229": "toward", "1230": "written", "1231": "planters", "1232": "mural", "1233": "men", "1234": "formal", "1235": "bar", "1236": "current", "1237": "sword", "1238": "started", "1239": "whole", "1240": "3rd", "1241": "giraffe", "1242": "walled", "1243": "cathedral", "1244": "those", "1245": "double", "1246": "cow", "1247": "changes", "1248": "wallpaper", "1249": "take", "1250": "til", "1251": "cheetah", "1252": "column", "1253": "section", "1254": "ledge", "1255": "fridge", "1256": "bust", "1257": "maroon", "1258": "downstairs", "1259": "homes", "1260": "foreward", "1261": "statue", "1262": "where", "1263": "fifth", "1264": "urinal", "1265": "switch", "1266": "salon", "1267": "begin", "1268": "pointing", "1269": "with", "1270": "bridal", "1271": "showing", "1272": "toilet", "1273": "crab", "1274": "shop", "1275": "tiny", "1276": "pass", "1277": "lit", "1278": "things", "1279": "serving", "1280": "tea", "1281": "skin", "1282": "boards", "1283": "next", "1284": "oil", "1285": "herringbone", "1286": "plates", "1287": "small", "1288": "wing", "1289": "printer", "1290": "boat", "1291": "entered", "1292": "seven", "1293": "here", "1294": "once", "1295": "start", "1296": "orient", "1297": "carpet", "1298": "enclosed", "1299": "pocket", "1300": "dome", "1301": "will", "1302": "pew", "1303": "no", "1304": "zebra", "1305": "canvas", "1306": "runs", "1307": "having", "1308": "platform", "1309": "raised", "1310": "all", "1311": "loft", "1312": "stoves", "1313": "shower", "1314": "greyish", "1315": "star", "1316": "cardboard", "1317": "table-tennis", "1318": "stay", "1319": "laundry", "1320": "``", "1321": "impact", "1322": "store", "1323": "char", "1324": "mounted", "1325": "check", "1326": "geometric", "1327": "tank", "1328": "bedside", "1329": "staircases", "1330": "degrees", "1331": "electrical", "1332": "look", "1333": "head", "1334": "row", "1335": "frames", "1336": "buddha", "1337": "archways", "1338": "roped", "1339": "square", "1340": "futon", "1341": "en", "1342": "dogleg", "1343": "box", "1344": "decoration", "1345": "floored", "1346": "#", "1347": "fake", "1348": "black", "1349": "metre", "1350": "opportunity", "1351": "goright", "1352": "rocking", "1353": "metallic", "1354": "massage", "1355": "6", "1356": "receptionist", "1357": "of", "1358": "jump", "1359": "sit", "1360": "parallel", "1361": "rug", "1362": "midway", "1363": "opens", "1364": "halfway", "1365": "fitness", "1366": "4", "1367": "let", "1368": "pause", "1369": "class", "1370": "be", "1371": "avoid", "1372": "sticking", "1373": "pig", "1374": "shuttered", "1375": "book", "1376": "forward", "1377": "that", "1378": "glass-topped", "1379": "seventh", "1380": "top", "1381": "boxes", "1382": "great", "1383": "hanging", "1384": "ventilation", "1385": "number", "1386": "conference", "1387": "utility", "1388": "sign", "1389": "tun", "1390": "pane", "1391": "background", "1392": "block", "1393": "telephone", "1394": "plywood", "1395": "curio", "1396": "machines", "1397": "gazebo", "1398": "workbench", "1399": "quarter", "1400": "exist", "1401": "advance", "1402": "microwave", "1403": "bathing", "1404": "cabinets", "1405": "fancy", "1406": "walked", "1407": "pillars", "1408": "separating", "1409": "workroom", "1410": "setting", "1411": "ballroom", "1412": "turk", "1413": "odd-shaped", "1414": "grey", "1415": "grand", "1416": "stairways", "1417": "ten", "1418": "oriental", "1419": "drums", "1420": "get", "1421": "away", "1422": "outside", "1423": "billiards", "1424": "decorations", "1425": "cross", "1426": "american", "1427": 
"hub", "1428": "opened", "1429": "entrances", "1430": "l", "1431": "cactus", "1432": "wet", "1433": "lockers", "1434": "church", "1435": "doorstep", "1436": "intersecting", "1437": "stage", "1438": "vertical", "1439": "diagonal", "1440": "woven", "1441": "video", "1442": "ski", "1443": "center", "1444": "arch", "1445": "sharply", "1446": "at", "1447": "vertically", "1448": "cloth", "1449": "anymore", "1450": "fountains", "1451": "porch", "1452": "rounded", "1453": "birdcage", "1454": "first", "1455": "carpeting", "1456": "minifridge", "1457": "vase", "1458": "goodfellas", "1459": "plank", "1460": "dark", "1461": "vent", "1462": "storage", "1463": "antechamber", "1464": "forty-five", "1465": "instruction", "1466": "dressers", "1467": "aside", "1468": "tv", "1469": "alongside", "1470": "boulders", "1471": "into", "1472": "descending", "1473": "barber", "1474": "posts", "1475": "wreath", "1476": "cowhide", "1477": "direclty", "1478": "curving", "1479": "projection", "1480": "dinning", "1481": "u", "1482": "second", "1483": "farthest", "1484": "twin", "1485": "trees", "1486": "passage", "1487": "keep", "1488": "skylight", "1489": "covered", "1490": "close", "1491": "lion", "1492": "steps", "1493": "grass", "1494": "doors", "1495": "put", "1496": "spa", "1497": "wood", "1498": "arrangement", "1499": "sectional", "1500": "stair", "1501": "adjoining", "1502": "hat", "1503": "faucet", "1504": "handing", "1505": "room", "1506": "corner", "1507": "wheel", "1508": "till", "1509": "swing", "1510": "lawn", "1511": "extinguishers", "1512": "side", "1513": "colored", "1514": "between", "1515": "marbled", "1516": "changing", "1517": "curves", "1518": "pedestal", "1519": "ball", "1520": "hats", "1521": "read", "1522": "identical", "1523": "ceiling", "1524": "quick", "1525": "really", "1526": "paintings", "1527": "supplies", "1528": "original", "1529": "washer/dryer", "1530": "plan", "1531": "dining", "1532": "washer", "1533": "aisle", "1534": "sun", "1535": "semi", "1536": "far", "1537": "footstool", "1538": "goal", "1539": "patio", "1540": "way", "1541": "reverse", "1542": "directly", "1543": "wooden/glass", "1544": "too", "1545": "until", "1546": "sunken", "1547": "painting", "1548": "coffee", "1549": "see", "1550": "intersects", "1551": "picture", "1552": "their", "1553": "without", "1554": "huge", "1555": "restroom", "1556": "nearest", "1557": "night", "1558": "straw", "1559": "children", "1560": "flowerbed", "1561": "drawn", "1562": "strip", "1563": "yard", "1564": "paved", "1565": "and", "1566": "using", "1567": "u-turn", "1568": "await", "1569": "downward", "1570": "tiled", "1571": "(", "1572": "crystal", "1573": "carts", "1574": "built-in", "1575": "dinette", "1576": "but", "1577": "medical", "1578": "clockwise", "1579": "control", "1580": "enters", "1581": "back", "1582": "dog", "1583": "90", "1584": "arms", "1585": "also", "1586": "hedges", "1587": "chairs/stools", "1588": "against", "1589": "opposite", "1590": "washing", "1591": "transparent", "1592": "blackboard", "1593": "2", "1594": "study", "1595": "sinks", "1596": "passes", "1597": "white", "1598": "short", "1599": "five", "1600": "place", "1601": "push", "1602": "barrier", "1603": "plaid", "1604": "stainless", "1605": "wash", "1606": "surrounded", "1607": "slats", "1608": "beginning", "1609": "stairs", "1610": "couches", "1611": "dollhouse", "1612": "recessed", "1613": "near", "1614": "recliners", "1615": "hedge", "1616": "material", "1617": "faced", "1618": "sop", "1619": "elliptical", "1620": "leopard", "1621": "car", "1622": "pine", 
"1623": "robes", "1624": "teh"}, "idx": 1625} \ No newline at end of file diff --git a/ss_baselines/savi/dialog/ques_gen/ques_gen.py b/ss_baselines/savi/dialog/ques_gen/ques_gen.py new file mode 100644 index 0000000..e2c9710 --- /dev/null +++ b/ss_baselines/savi/dialog/ques_gen/ques_gen.py @@ -0,0 +1,136 @@ +# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL) +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# SPDX-License-Identifier: AGPL-3.0-or-later +# SPDX-License-Identifier: CC-BY-4.0 + +from torchvision import transforms +import os +import torch +import sys +import numpy as np +# sys.path.append('./utils/') +import torch.nn as nn +import torchvision.models as models + +from ss_baselines.savi.dialog.ques_gen.models.iq_vln import IQ_VLN +from ss_baselines.savi.dialog.ques_gen.utils import load_vocab +from ss_baselines.savi.dialog.ques_gen.utils import get_glove_embedding + +# need to add logger too + +# set up the parameters +def set_params(): + params = {} + params['model_path'] = './ss_baselines/savi/dialog/ques_gen/weights/vqg-tf-50.pkl' + params['max_length'] = 20 + params['hidden_size'] = 512 + params['embedding_dim'] = 300 + params['num_layers'] = 1 + params['rnn_cell'] = 'LSTM' + params['dropout_p'] = 0.3 + params['input_dropout_p'] = 0.3 + params['encoder_max_len'] = 4 # ?? + params['num_att_layers'] = 2 + params['z_size'] = 100 + params['no_answer_recon'] = True + params['no_image_recon'] = True + params['no_category_space'] = True + params['vocab_path'] = './ss_baselines/savi/dialog/ques_gen/processed/vocab_iq_vln.json' + params['embedding_name'] = '6B' + + return params + + +class DecentralizedDistributedMixinQuesGen: + def init_distributed(self, find_unused_params: bool = True) -> None: + r"""Initializes distributed training for the model + 1. Broadcasts the model weights from world_rank 0 to all other workers + 2. Adds gradient hooks to the model + :param find_unused_params: Whether or not to filter out unused parameters + before gradient reduction. This *must* be True if + there are any parameters in the model that where unused in the + forward pass, otherwise the gradient reduction + will not work correctly. 
+ """ + # NB: Used to hide the hooks from the nn.Module, + # so they don't show up in the state_dict + class Guard: + def __init__(self, model, device): + if torch.cuda.is_available(): + self.ddp = torch.nn.parallel.DistributedDataParallel( + model, device_ids=[device], output_device=device + ) + else: + self.ddp = torch.nn.parallel.DistributedDataParallel(model) + + self._ddp_hooks = Guard(self, self.device) + + self.reducer = self._ddp_hooks.ddp.reducer + self.find_unused_params = find_unused_params + + def before_backward(self, loss): + if self.find_unused_params: + self.reducer.prepare_for_backward([loss]) + else: + self.reducer.prepare_for_backward([]) + + +class QuesGen: + def __init__(self, device): + params = set_params() + self.device = device + # getting float with range (0,255) and also tensor + # divide it with 255 and normalize + # dividing it in ques_out() + # need to make it consistant with ques generation training + self.transform = transforms.Compose([transforms.Normalize(mean=[0.3853, 0.3853, 0.3855], + std=[0.1050, 0.1050, 0.1047]) + ]) + + self.vocab = load_vocab(params['vocab_path']) + + # should always use glove(or any other learned) embedding + embedding = get_glove_embedding(params['embedding_name'], + params['embedding_dim'], + self.vocab) + + # Initialize model + self.vqg = IQ_VLN(len(self.vocab), params['max_length'], params['hidden_size'], params['embedding_dim'], + self.vocab(self.vocab.SYM_SOQ), self.vocab(self.vocab.SYM_EOS), + num_layers=params['num_layers'], + rnn_cell=params['rnn_cell'], + dropout_p=params['dropout_p'], + input_dropout_p=params['input_dropout_p'], + encoder_max_len=params['encoder_max_len'], + embedding=embedding, + num_att_layers=params['num_att_layers'], + z_size=params['z_size'], + no_answer_recon=params['no_answer_recon'], + no_image_recon=params['no_image_recon'], + no_category_space=params['no_category_space']) + + self.vqg.load_state_dict(torch.load(params['model_path'])) + self.vqg.to(device=self.device) + ''' + if torch.cuda.is_available(): + self.vqg.cuda() + self.vqg.eval() + ''' + + def ques_out(self, image): + # as getting float with range (0, 255) + # need to make it consistant with training + image = image/255.0 + image = self.transform(image) + output = self.vqg.predict_from_image(image) + # let's consider only single image + # if multiple image then next line needs to be updated + ques = self.vocab.tokens_to_words(output[0]) + return ques + + +class QuesGenDDP(QuesGen, DecentralizedDistributedMixinQuesGen): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) diff --git a/ss_baselines/savi/dialog/ques_gen/utils/__init__.py b/ss_baselines/savi/dialog/ques_gen/utils/__init__.py new file mode 100644 index 0000000..c73c557 --- /dev/null +++ b/ss_baselines/savi/dialog/ques_gen/utils/__init__.py @@ -0,0 +1,6 @@ +# Copyright (C) 2019, Ranjay Krishna +# +# SPDX-License-Identifier: MIT + +from .train_utils import get_glove_embedding +from .vocab_vln import load_vocab diff --git a/ss_baselines/savi/dialog/ques_gen/utils/train_utils.py b/ss_baselines/savi/dialog/ques_gen/utils/train_utils.py new file mode 100644 index 0000000..52bdce0 --- /dev/null +++ b/ss_baselines/savi/dialog/ques_gen/utils/train_utils.py @@ -0,0 +1,209 @@ +# Copyright (C) 2019, Ranjay Krishna +# +# SPDX-License-Identifier: MIT + +"""Utility functions for training. +""" + +import json +import torch +import torchtext +import sys +import numpy as np + + +# =========================================================== +# Vocabulary. 
diff --git a/ss_baselines/savi/dialog/ques_gen/utils/__init__.py b/ss_baselines/savi/dialog/ques_gen/utils/__init__.py
new file mode 100644
index 0000000..c73c557
--- /dev/null
+++ b/ss_baselines/savi/dialog/ques_gen/utils/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (C) 2019, Ranjay Krishna
+#
+# SPDX-License-Identifier: MIT
+
+from .train_utils import get_glove_embedding
+from .vocab_vln import load_vocab
diff --git a/ss_baselines/savi/dialog/ques_gen/utils/train_utils.py b/ss_baselines/savi/dialog/ques_gen/utils/train_utils.py
new file mode 100644
index 0000000..52bdce0
--- /dev/null
+++ b/ss_baselines/savi/dialog/ques_gen/utils/train_utils.py
@@ -0,0 +1,209 @@
+# Copyright (C) 2019, Ranjay Krishna
+#
+# SPDX-License-Identifier: MIT
+
+"""Utility functions for training.
+"""
+
+import json
+import torch
+import torchtext
+import sys
+import numpy as np
+
+
+# ===========================================================
+# Vocabulary.
+# ===========================================================
+
+class Vocabulary(object):
+    """Keeps track of all the words in the vocabulary.
+    """
+
+    # Reserved symbols (the literal token strings must match the ones
+    # stored in the vocab JSON)
+    SYM_PAD = '<pad>'    # padding.
+    SYM_SOQ = '<soq>'    # Start of question.
+    SYM_SOR = '<sor>'    # Start of response.
+    SYM_EOS = '<eos>'    # End of sentence.
+    SYM_UNK = '<unk>'    # Unknown word.
+    SYM_QUES = '?'
+
+    def __init__(self):
+        """Constructor for Vocabulary.
+        """
+        # Init mappings between words and ids
+        self.word2idx = {}
+        self.idx2word = {}
+        self.idx = 0
+        self.add_word(self.SYM_PAD)
+        self.add_word(self.SYM_SOQ)
+        self.add_word(self.SYM_SOR)
+        self.add_word(self.SYM_EOS)
+        self.add_word(self.SYM_UNK)
+        self.add_word(self.SYM_QUES)
+
+    def add_word(self, word):
+        """Adds a new word and updates the total number of unique words.
+
+        Args:
+            word: String representation of the word.
+        """
+        if word not in self.word2idx:
+            self.word2idx[word] = self.idx
+            self.idx2word[self.idx] = word
+            self.idx += 1
+
+    def remove_word(self, word):
+        """Removes a specified word and updates the total number of unique words.
+
+        Note: only safe for the most recently added word, since the indices
+        of the remaining words are not re-packed.
+
+        Args:
+            word: String representation of the word.
+        """
+        if word in self.word2idx:
+            idx = self.word2idx.pop(word)
+            self.idx2word.pop(idx)
+            self.idx -= 1
+
+    def __call__(self, word):
+        if word not in self.word2idx:
+            return self.word2idx[self.SYM_UNK]
+        return self.word2idx[word]
+
+    def __len__(self):
+        return len(self.word2idx)
+
+    def save(self, location):
+        with open(location, 'w') as f:
+            json.dump({'word2idx': self.word2idx,
+                       'idx2word': self.idx2word,
+                       'idx': self.idx}, f)
+
+    def load(self, location):
+        with open(location, 'r') as f:
+            data = json.load(f)
+            self.word2idx = data['word2idx']
+            self.idx2word = data['idx2word']
+            self.idx = data['idx']
+
+    def tokens_to_words(self, tokens):
+        """Converts tokens to vocab words.
+
+        Args:
+            tokens: 1D Tensor of Token outputs.
+
+        Returns:
+            A string of the words joined by spaces, stopping at <eos>.
+        """
+        words = []
+        for token in tokens:
+            word = self.idx2word[str(token.item()) if str(token.item()) in self.idx2word.keys() else token.item()]
+            if word == self.SYM_EOS:
+                break
+            if word not in [self.SYM_PAD, self.SYM_SOQ,
+                            self.SYM_SOR, self.SYM_EOS]:
+                words.append(word)
+        sentence = str(' '.join(words))
+        return sentence
+
+    def words_to_tokens(self, words):
+        """Converts vocab words to tokens.
+
+        Args:
+            words: A string of space-separated words.
+
+        Returns:
+            A numpy array of token ids.
+        """
+        tokens = []
+        for word in words.split(' '):
+            if word in self.word2idx.keys():
+                token = self.word2idx[word]
+            else:
+                token = self.word2idx[self.SYM_UNK]
+            tokens.append(token)
+        return np.array(tokens)
+
+
+def get_glove_embedding(name, embed_size, vocab):
+    """Construct embedding tensor.
+
+    Args:
+        name (str): Which GloVe embedding to use.
+        embed_size (int): Dimensionality of embeddings.
+        vocab: Vocabulary to generate embeddings.
+    Returns:
+        embedding (vocab_size, embed_size): Tensor of
+            GloVe word embeddings.
+    """
+
+    glove = torchtext.vocab.GloVe(name=name,
+                                  dim=str(embed_size))
+    vocab_size = len(vocab)
+    embedding = torch.zeros(vocab_size, embed_size)
+    for i in range(vocab_size):
+        embedding[i] = glove[vocab.idx2word[str(i) if str(i) in vocab.idx2word.keys() else i]]
+    return embedding
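+
+# Illustrative check (hypothetical invocation): rows of the returned tensor
+# line up with vocabulary ids, i.e. embedding[i] is the GloVe vector for
+# vocab.idx2word[i]; torchtext downloads the '6B' vectors on first use.
+#
+#     vocab = load_vocab('./ss_baselines/savi/dialog/ques_gen/processed/vocab_iq_vln.json')
+#     embedding = get_glove_embedding('6B', 300, vocab)
+#     assert embedding.shape == (len(vocab), 300)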
+
+
+# ===========================================================
+# Helpers.
+# ===========================================================
+
+def process_lengths(inputs, pad=0):
+    """Calculates the length of all the sequences in inputs.
+
+    Args:
+        inputs: A batch of tensors containing the question or response
+            sequences.
+
+    Returns: A list of their lengths.
+    """
+    max_length = inputs.size(1)
+    if inputs.size(0) == 1:
+        lengths = list(max_length - inputs.data.eq(pad).sum(1))
+    else:
+        lengths = list(max_length - inputs.data.eq(pad).sum(1).squeeze())
+    return lengths
+
+
+# ===========================================================
+# Evaluation metrics.
+# ===========================================================
+
+def gaussian_KL_loss(mus, logvars, eps=1e-8):
+    """Calculates KL distance of mus and logvars from unit normal.
+
+    Args:
+        mus: Tensor of means predicted by the encoder.
+        logvars: Tensor of log vars predicted by the encoder.
+
+    Returns:
+        KL loss between mus and logvars and the normal unit gaussian.
+    """
+    KLD = -0.5 * torch.sum(1 + logvars - mus.pow(2) - logvars.exp())
+    kl_loss = KLD/(mus.size(0) + eps)
+    """
+    if kl_loss > 100:
+        print kl_loss
+        print KLD
+        print mus.min(), mus.max()
+        print logvars.min(), logvars.max()
+        1/0
+    """
+    return kl_loss
+
+
+def vae_loss(outputs, targets, mus, logvars, criterion):
+    """VAE loss that combines cross entropy with KL divergence.
+
+    Args:
+        outputs: The predictions made by the model.
+        targets: The ground truth indices in the vocabulary.
+        mus: Tensor of means predicted by the encoder.
+        logvars: Tensor of log vars predicted by the encoder.
+        criterion: The cross entropy criterion.
+    """
+    CE = criterion(outputs, targets)
+    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
+    KLD = gaussian_KL_loss(mus, logvars)
+    return CE + KLD
diff --git a/ss_baselines/savi/dialog/ques_gen/utils/vocab_vln.py b/ss_baselines/savi/dialog/ques_gen/utils/vocab_vln.py
new file mode 100644
index 0000000..d72fc59
--- /dev/null
+++ b/ss_baselines/savi/dialog/ques_gen/utils/vocab_vln.py
@@ -0,0 +1,142 @@
+# Copyright (C) 2022-2023 Mitsubishi Electric Research Laboratories (MERL)
+# Copyright (C) 2019, Ranjay Krishna
+#
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# SPDX-License-Identifier: MIT
+
+"""Creates a vocabulary using iq_dataset for the vqa dataset.
+"""
+
+from collections import Counter
+from ss_baselines.savi.dialog.ques_gen.utils import train_utils
+
+import argparse
+import json
+import logging
+import nltk
+import numpy as np
+import re
+import sys
+
+
+def process_text(text, vocab, max_length=20):
+    """Converts text into a list of tokens surrounded by <soq> and <eos>.
+
+    Args:
+        text: String text.
+        vocab: The vocabulary instance.
+        max_length: The max allowed length.
+
+    Returns:
+        output: A numpy array with the tokenized text.
+        length: The length of the text.
+    """
+    tokens = tokenize(text.lower().strip())
+    output = []
+    output.append(vocab(vocab.SYM_SOQ))  # <soq>
+    output.extend([vocab(token) for token in tokens])
+    output.append(vocab(vocab.SYM_EOS))  # <eos>
+    length = min(max_length, len(output))
+    return np.array(output[:length]), length
+
+
+def load_vocab(vocab_path):
+    """Load Vocabulary object from a JSON file.
+
+    Args:
+        vocab_path: The location of the vocab JSON file.
+
+    Returns:
+        A Vocabulary object.
+    """
+    vocab = train_utils.Vocabulary()
+    vocab.load(vocab_path)
+    return vocab
+
+
+def tokenize(sentence):
+    """Tokenizes a sentence into words.
+
+    Args:
+        sentence: A string of words.
+
+    Returns:
+        A list of words.
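+
+    Example (assumes NLTK's 'punkt' tokenizer data is available):
+
+        >>> tokenize('Walk past the sofa, then stop.')
+        ['walk', 'past', 'the', 'sofa', ',', 'then', 'stop', '.']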
+ """ + # sentence = sentence.decode('utf8') + if len(sentence) == 0: + return [] + sentence = re.sub('\.+', r'.', sentence) + sentence = re.sub('([a-z])([.,!?()])', r'\1 \2 ', sentence) + sentence = re.sub('\s+', ' ', sentence) + + tokens = nltk.tokenize.word_tokenize( + sentence.strip().lower()) + return tokens + + +def build_vocab(annot, threshold): + """Build a vocabulary from the annotations. + + Args: + annot: A list file containing the instruction with other annotation. + threshold: The minimum number of times a word must occur. Otherwise it + is treated as an `Vocabulary.SYM_UNK`. + + Returns: + A Vocabulary object. + """ + + words = [] + counter = Counter() + for i, entry in enumerate(annot): + for inst in entry['instructions']: + question = inst #.encode('utf8') + q_tokens = tokenize(question) + counter.update(q_tokens) + + if i % 1000 == 0: + logging.info("worked on %d entries." % (i)) + + # If a word frequency is less than 'threshold', then the word is discarded. + words.extend([word for word, cnt in counter.items() if cnt >= threshold]) + words = list(set(words)) + vocab = create_vocab(words) + return vocab + + +def create_vocab(words): + # Adds the words to the vocabulary. + vocab = train_utils.Vocabulary() + for i, word in enumerate(words): + vocab.add_word(word) + return vocab + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + # Hyperparameters. + parser.add_argument('--threshold', type=int, default=2, + help='Minimum word count threshold.') + + # Outputs. + parser.add_argument('--vocab-path', type=str, + default='data/processed/vocab_iq_vln.json', + help='Path for saving vocabulary wrapper for vln.') + args = parser.parse_args() + + # make sure to add symlink + # for example: + # ln -s /homes/supaul/Desktop/H/data/Fine-Grained-R2R /homes/supaul/sudipta/work/dialog_module/ques_gen/iq/data/raw + tr_path = './data/raw/Fine-Grained-R2R/data/FGR2R_train.json' + # tr is a list + with open(tr_path) as f: + tr = json.load(f) + + # Configure logging + logging.basicConfig(level=logging.INFO) + vocab = build_vocab(tr, args.threshold) + logging.info("Total vocabulary size: %d" % len(vocab)) + vocab.save(args.vocab_path) + logging.info("Saved the vocabulary wrapper to '%s'" % args.vocab_path) diff --git a/ss_baselines/savi/dialog/ques_gen/weights/.gitkeep b/ss_baselines/savi/dialog/ques_gen/weights/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/ss_baselines/savi/dialog/speaker/CMakeLists.txt b/ss_baselines/savi/dialog/speaker/CMakeLists.txt new file mode 100644 index 0000000..d1314ae --- /dev/null +++ b/ss_baselines/savi/dialog/speaker/CMakeLists.txt @@ -0,0 +1,80 @@ +project(Matterport_Simulator CXX) +cmake_minimum_required(VERSION 2.8) + +option(OSMESA_RENDERING "Render offscreen with OSMesa" ON) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +include_directories("${PROJECT_SOURCE_DIR}/include") + +find_package(OpenCV REQUIRED) +#find_package(OpenCV REQUIRED +# PATHS "/home/dfried/lib/opencv/build/lib/") +#message(status "opencv libs: ${OpenCV_LIBS}") +#set(OpenCV_LIBS "/home/dfried/lib/opencv/build/lib/") +find_package(PkgConfig REQUIRED) + +pkg_check_modules(JSONCPP REQUIRED jsoncpp) + +if(OSMESA_RENDERING) + pkg_check_modules(OSMESA REQUIRED osmesa) + set(GL_LIBS ${OSMESA_LIBRARIES}) +else() + #set(OPENGL_LIBRARIES "/usr/include/GL") + #set(GLEW_LIBRARIES "/usr/include/GL") + set(GLEW_LIBRARIES "/usr/lib/x86_64-linux-gnu/libGLEW.so") + set(OPENGL_LIBRARIES 
"/usr/lib/x86_64-linux-gnu/libglut.so") + #find_package(OpenGL REQUIRED) + # find_package(GLEW REQUIRED) + set(GL_LIBS ${OPENGL_LIBRARIES} ${GLEW_LIBRARIES}) +endif() + +add_library(MatterSim SHARED src/lib/MatterSim.cpp src/lib/Benchmark.cpp) +if(OSMESA_RENDERING) + target_compile_definitions(MatterSim PUBLIC "-DOSMESA_RENDERING") +endif() +target_include_directories(MatterSim PRIVATE ${JSONCPP_INCLUDE_DIRS}) +target_link_libraries(MatterSim ${JSONCPP_LIBRARIES} ${OpenCV_LIBS} ${GL_LIBS}) + +add_executable(tests src/test/main.cpp) +target_include_directories(tests PRIVATE ${JSONCPP_INCLUDE_DIRS}) +target_link_libraries(tests MatterSim ${JSONCPP_LIBRARIES} ${OpenCV_LIBS}) + +add_executable(mattersim_main src/driver/mattersim_main.cpp) +target_link_libraries(mattersim_main MatterSim) + +add_executable(random_agent src/driver/random_agent.cpp) +target_link_libraries(random_agent MatterSim) + +#set(PYTHON_EXECUTABLE "/home/dfried/anaconda2/envs/m3d3/bin/python") +#set(PYTHON_EXECUTABLE "/home/dfried/anaconda2/bin/python") +#set(PYTHON_INCLUDE_DIR "/home/dfried/anaconda2/include") +#set(PYTHON_LIBRARY "/home/dfried/anaconda2/lib") + +#find_package(PythonInterp 2.7) +# message(${PYTHON_EXECUTABLE}) + +add_subdirectory(pybind11) + +# +# # Need to search for python executable again to pick up an activated +# # virtualenv python, if any. +# unset(PYTHON_EXECUTABLE CACHE) +# find_program(PYTHON_EXECUTABLE python +# PATHS ENV PATH # look in the PATH environment variable +# NO_DEFAULT_PATH # do not look anywhere else... +# ) +# Make FindNumPy available +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake") +find_package(NumPy REQUIRED) + +pybind11_add_module(MatterSimPython src/lib_python/MatterSimPython.cpp) +target_include_directories(MatterSimPython PRIVATE ${NUMPY_INCLUDES}) +target_link_libraries(MatterSimPython PRIVATE MatterSim) +set_target_properties(MatterSimPython + PROPERTIES + OUTPUT_NAME MatterSim) diff --git a/ss_baselines/savi/dialog/speaker/Doxyfile b/ss_baselines/savi/dialog/speaker/Doxyfile new file mode 100644 index 0000000..9833cee --- /dev/null +++ b/ss_baselines/savi/dialog/speaker/Doxyfile @@ -0,0 +1,2304 @@ +# Doxyfile 1.8.6 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. 
This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "Matterport3D Simulator" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify an logo or icon that is included in +# the documentation. The maximum height of the logo should not exceed 55 pixels +# and the maximum width should not exceed 200 pixels. Doxygen will copy the logo +# to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = ./doxygen/ + +# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. 
Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. + +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. + +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. 
+# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce a +# new page for each member. If set to NO, the documentation of a member will be +# part of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make +# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C +# (default is Fortran), use: inc=Fortran f=C. +# +# Note For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. 
+
+EXTENSION_MAPPING =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibility issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT = YES
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also makes the inheritance and
+# collaboration diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will
+# make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
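+#
+# For instance, a class documented as "/*! \ingroup mygroup */ class Widget
+# {};" (mygroup and Widget are hypothetical names) would then be rendered on
+# the mygroup page itself rather than on a page of its own.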
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When the TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class, and the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE and EXTRACT_STATIC tags, respectively, are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL = YES
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code.
When set to YES, local methods,
+# which are defined in the implementation section but not in the interface, are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous
+# namespaces are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO these classes will be included in the various overviews. This option has
+# no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
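+#
+# That is, with YES an include would be listed as #include "header.h" rather
+# than #include <header.h> (header.h being a hypothetical file name).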
+ +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable ( YES) or disable ( NO) the +# todo list. This list is created by putting \todo commands in the +# documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable ( YES) or disable ( NO) the +# test list. This list is created by putting \test commands in the +# documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable ( YES) or disable ( NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable ( YES) or disable ( NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. 
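+#
+# For example, a comment such as "/// \deprecated Superseded by newApi()."
+# (newApi being a hypothetical name) adds the member to this list.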
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer commands in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES the list
+# will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output is used as
+# the file version. For an example see the documentation.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. Do not use file names with spaces, bibtex cannot handle them. See
+# also \cite for info on how to create references.
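+#
+# A hypothetical example pointing doxygen at two bibliography files (the .bib
+# extension of the second entry would be appended automatically):
+# CITE_BIB_FILES = doc/refs.bib doc/more_refs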
+ +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error ( stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES, then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO doxygen will only warn about wrong or incomplete parameter +# documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. +# Note: If this tag is empty the current directory is searched. + +INPUT = ./include \ + ./src + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. 
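+#
+# For instance, sources saved in Latin-1 could be declared with the
+# corresponding libiconv encoding name (illustrative only):
+# INPUT_ENCODING = ISO-8859-1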
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank the
+# following patterns are tested: *.c, *.cc, *.cxx, *.cpp, *.c++, *.java, *.ii,
+# *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp,
+# *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown,
+# *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf,
+# *.qsf, *.as and *.js.
+
+FILE_PATTERNS =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE = include/Catch.hpp
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories, for example, use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output.
If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER ) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES, then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. 
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen's built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES doxygen will generate HTML output.
+# The default value is: YES.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
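+#
+# For example, with a hypothetical OUTPUT_DIRECTORY = docs and the default
+# HTML_OUTPUT = html, the generated pages would end up under docs/html/.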
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML, the header file must include any scripts and style sheets
+# that doxygen needs, which depend on the configuration options used (e.g. the
+# setting GENERATE_TREEVIEW). It is highly recommended to start with a default
+# header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional user-
+# defined cascading style sheet that is included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet file to the output directory. For an example
+# see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
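+#
+# A hypothetical example copying a logo and an extra script into the base HTML
+# output directory:
+# HTML_EXTRA_FILES = images/logo.png js/extra.js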
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the stylesheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries to 1 will produce a fully collapsed tree by default. 0 is a special
+# value representing an infinite number of entries and will result in a fully
+# expanded tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup.
See
+# http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more
+# information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION =
+
+# The GENERATE_CHI flag controls whether a separate .chi index file is generated
+# (YES) or included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated ( +# YES) or a normal table of contents ( NO) in the .chm file. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. 
To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files need
+# to be copied into the plugins directory of Eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying, Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom stylesheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes take effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using prerendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want formulas to look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example:
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output.
The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function, so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow; in that
+# case enabling SERVER_BASED_SEARCH may provide a better solution. It is
+# possible to search using the keyboard; to jump to the search box use
+# <access key> + S (what the <access key> is depends on the OS and browser, but
+# it is typically <CTRL>, <ALT>/<option>