From c09314c54603b9609d472c080a8a5980e306a4ad Mon Sep 17 00:00:00 2001
From: Xinrun Xu <v.xinrun@gmail.com>
Date: Mon, 8 Jul 2024 18:01:52 +0800
Subject: [PATCH] Refine software environment (#52)

* Refine software environment
* Update software.md
---
 .../rdr2/composite_skills/auto_shoot.py       | 14 ++++---
 .../rdr2/composite_skills/follow.py           |  2 +-
 cradle/environment/skill_registry.py          |  1 -
 cradle/provider/object_detect/gd_provider.py  |  6 ++-
 cradle/utils/image_utils.py                   |  4 +-
 docs/envs/software.md                         | 42 ++++++++++++-------
 requirements.txt                              |  4 ++
 7 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/cradle/environment/rdr2/composite_skills/auto_shoot.py b/cradle/environment/rdr2/composite_skills/auto_shoot.py
index 0af5dd0..dc92806 100644
--- a/cradle/environment/rdr2/composite_skills/auto_shoot.py
+++ b/cradle/environment/rdr2/composite_skills/auto_shoot.py
@@ -6,11 +6,6 @@
 import torch
 from torchvision.ops import box_convert
 
-try:
-    from groundingdino.util.inference import annotate
-except:
-    pass
-
 from cradle.config import Config
 from cradle.log import Logger
 from cradle.gameio.io_env import IOEnvironment
@@ -18,13 +13,20 @@
 from cradle.environment.rdr2.atomic_skills.move import turn
 from cradle.environment.rdr2.skill_registry import register_skill
 from cradle.utils.image_utils import exec_clip_minimap
-from cradle.utils.object_utils import groundingdino_detect, circle_detector_detect
 from cradle import constants
 
 config = Config()
 logger = Logger()
 io_env = IOEnvironment()
 
+if config.is_game == True:
+    try:
+        from groundingdino.util.inference import annotate
+    except:
+        pass
+
+    from cradle.utils.object_utils import groundingdino_detect, circle_detector_detect
+
 DEFAULT_MAX_SHOOTING_ITERATIONS = 100
 SHOOT_PEOPLE_TARGET_NAME = "person"
 SHOOT_WOLVES_TARGET_NAME = "wolf"
diff --git a/cradle/environment/rdr2/composite_skills/follow.py b/cradle/environment/rdr2/composite_skills/follow.py
index 9742015..f9d6ae2 100644
--- a/cradle/environment/rdr2/composite_skills/follow.py
+++ b/cradle/environment/rdr2/composite_skills/follow.py
@@ -10,7 +10,7 @@
 from cradle.environment.rdr2.atomic_skills.move import turn, move_forward
 from cradle.environment.rdr2.skill_registry import register_skill
 from cradle.utils.image_utils import exec_clip_minimap
-from cradle.utils.object_utils import groundingdino_detect, circle_detector_detect
+from cradle.utils.object_utils import circle_detector_detect
 from cradle import constants
 
 config = Config()
diff --git a/cradle/environment/skill_registry.py b/cradle/environment/skill_registry.py
index e723823..8859056 100644
--- a/cradle/environment/skill_registry.py
+++ b/cradle/environment/skill_registry.py
@@ -20,7 +20,6 @@
 from cradle.utils.check import is_valid_value
 from cradle.gameio.io_env import IOEnvironment
 from cradle.constants import *
-from cradle.utils.object_utils import groundingdino_detect, circle_detector_detect
 
 
 config = Config()
diff --git a/cradle/provider/object_detect/gd_provider.py b/cradle/provider/object_detect/gd_provider.py
index 6b618f6..247382e 100644
--- a/cradle/provider/object_detect/gd_provider.py
+++ b/cradle/provider/object_detect/gd_provider.py
@@ -3,12 +3,16 @@
 from cradle.provider import BaseProvider
 from cradle import constants
 from cradle.log import Logger
+from cradle.config import Config
 from cradle.memory import LocalMemory
-from cradle.utils.object_utils import groundingdino_detect
 
+config = Config()
 logger = Logger()
 memory = LocalMemory()
 
+if config.is_game == True:
+    from cradle.utils.object_utils import groundingdino_detect
+
 
 class GdProvider(BaseProvider):
 
diff --git a/cradle/utils/image_utils.py b/cradle/utils/image_utils.py
index 8400922..421d5b5 100644
--- a/cradle/utils/image_utils.py
+++ b/cradle/utils/image_utils.py
@@ -20,13 +20,15 @@
 from cradle.config import Config
 from cradle.gameio import IOEnvironment
 from cradle.log import Logger
-from cradle.utils.object_utils import groundingdino_detect
 from cradle import constants
 
 config = Config()
 io_env = IOEnvironment()
 logger = Logger()
 
+if config.is_game == True:
+    from cradle.utils.object_utils import groundingdino_detect
+
 
 def show_image(img):
 
diff --git a/docs/envs/software.md b/docs/envs/software.md
index 94e0a40..fc8c931 100644
--- a/docs/envs/software.md
+++ b/docs/envs/software.md
@@ -1,9 +1,24 @@
-Here are the settings for Software side.
-
 ## Software Setup
 
 ### 1. Install Software Dependencies
 
+**Install Segment Anything Model (SAM)**
+
+On Windows, install CUDA 11.8 from https://developer.nvidia.com/cuda-11-8-0-download-archive (Linux packages are also available there).
+
+Ensure PyTorch is installed with CUDA dependencies that match the installed CUDA version.
+
+```bash
+conda install pytorch torchvision cudatoolkit=11.8 -c nvidia -c pytorch
+```
+
+If this doesn't work, or you prefer the pip way, you can try something like:
+
+```bash
+pip3 install --upgrade torch==2.1.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html
+pip3 install torchvision==0.16.1+cu118 -f https://download.pytorch.org/whl/torch_stable.html
+```
+
 Download the [StableSAM](https://huggingface.co/spaces/abhishek/StableSAM/blob/main/sam_vit_h_4b8939.pth) model file and copy it to the `/cache` folder.
 
 ### 2. Change Computer Settings Before Running the Code
@@ -16,9 +31,15 @@ Then, set the folder that the agent will open to display in Large icons or Extra
 
 ![Large icons](../envs/images/software/large_icon.png)
 
-### 3. Open the software you want to test
+### 3. Open the software and task you want to run
+Cradle is mainly tested on Chrome, Outlook, Capcut, Meitu and Feishu. Theoretically, it can also be run on other software applications.
+
+#### 3.1 Follow [25 Tasks Provided](#25-tasks-provided) to choose the software and task you want to run
+Change the `task_id` in `cradle/runner/app_runner.py` according to the task descriptions in `cradle/conf/env_config_[env_name].json` to switch among tasks.
 
-Below are the exact software versions utilized in our paper:
+#### 3.2 Follow [Initial Stage for Each Software](#initial-stage-for-each-software) to open the software 
+
+Below are the exact software versions utilized:
 
 | Software | Version |
 | -------- | ------- |
@@ -33,7 +54,7 @@ In theory, any version can be used. However, if you want to reproduce our experi
 ### 4. Run
 
 To simplify operations, the default LLM model we use is OpenAI's `GPT-4o`.
-After opening the corresponding software in your main screen, use the follow script to let Cradle run.
+After opening the corresponding software on your main screen, use the following script to let Cradle run.
 
 ```bash
 # Run Chrome
@@ -48,9 +69,7 @@ python runner.py --envConfig "./conf/env_config_xiuxiu.json"
 python runner.py --envConfig "./conf/env_config_feishu.json"
 ```
 
-Or if you want use debug mode, you need to change the `--envConfig` target in `.vscode\launch.json` to the software's JSON file in the `conf\` directory that you want to test.
-
-## 25 Tasks in our Paper
+## 25 Tasks Provided
 
 Task Descriptions for Chrome, Outlook, CapCut, Meitu and Feishu. **Difficulty** refers to how hard it is for our agent to accomplish the corresponding tasks.
 
@@ -87,7 +106,7 @@ Task Descriptions for Chrome, Outlook, CapCut, Meitu and Feishu. **Difficulty**
 | #4 Set User Status| Open the user profile menu and set my status to "In meeting". | Medium |
 | #5 Start Video Conference | Create a new meeting and meet now. | Easy |
 
-## Initial Stage for Every Software
+## Initial Stage for Each Software
 
 ### 1. Chrome
 
@@ -154,8 +173,3 @@ For each task in Feishu, the initial page is shown in the figure:
 
    - Copy the `cradle\environment\chrome` folder located in `cradle\environment\` and rename it to match your software environment name. Replace all instances of "chrome" within the folder with your software's environment name.
    - Copy the `res\chrome` folder located in `res\` and rename it to your software environment name. Replace all instances of "chrome" within the folder with your software's environment name. Modify the prompts and template-matching icon images as needed for important UI elements that SAM2SOM cannot recognize.
-
-4. Debug and Terminal Modes:
-
-    - Debug Mode: Change the `--envConfig` target in `.vscode\launch.json` to point to the software's JSON file in the `conf\` directory that you want to test.
-    - Terminal Mode: Pass the `--envConfig` argument to the software's JSON file in the `conf\` directory that you want to test.
diff --git a/requirements.txt b/requirements.txt
index ff78e2c..1500f74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
+ahk==1.7.6
+ahk-binary==2023.9.0
 backoff==2.2.1
 openai==1.2.3
 python-dotenv==1.0.0
@@ -16,6 +18,8 @@ aiohttp
 easyocr==1.7.1
 spacy==3.7.2
 chardet==5.2.0
+matplotlib==3.9.1
+supervision==0.21.0
 pyobjc-framework-Quartz==10.0; sys_platform == "darwin"
 pyobjc-framework-Cocoa==10.0; sys_platform == "darwin"
 git+https://github.com/facebookresearch/segment-anything.git