added webscrapper
antonkulaga committed Nov 5, 2024
1 parent 39eaf1d commit c60af54
Showing 11 changed files with 272 additions and 7 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/build-websandbox
@@ -0,0 +1,43 @@
name: build and push websandbox

on:
push:
branches: [ main ]
paths:
- 'just_agents_coding/containers/websandbox/**'

env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}/websandbox

jobs:
build-and-push:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Log in to the Container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
context: ./just_agents_coding/containers/websandbox
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
1 change: 1 addition & 0 deletions examples/coding/bioinformatic_agent.py
@@ -13,6 +13,7 @@
"""
This example shows how to use a Chain Of Thought code agent to run Python code and bash commands.
It uses volumes (see tools.py) and is based on the ChainOfThoughtAgent class.
Note: this example is a work in progress; the task is currently too complex to be solved in one go.
"""

if __name__ == "__main__":
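For reference, the driving pattern for such a Chain Of Thought agent mirrors the web scraper example added in this commit: build the agent from a YAML profile, send a query, and persist both the answer and the reasoning trace. A minimal, hypothetical sketch (the profile filename and query are illustrative, not taken from this diff):

```python
from examples.coding.mounts import coding_examples_dir
from examples.coding.tools import write_thoughts_and_results
from just_agents.interfaces.IAgent import build_agent

# Build the agent from its YAML profile (filename is illustrative).
agent = build_agent(coding_examples_dir / "bioinformatic_agent.yaml")
# query() returns the final answer plus the intermediate reasoning steps.
result, thoughts = agent.query("Download GSE176043 with GEOparse and summarize the dataset")
write_thoughts_and_results("bioinformatic_example", thoughts, result)
```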
42 changes: 41 additions & 1 deletion examples/coding/tools.py
@@ -137,4 +137,44 @@ def amino_match_endswith(text, ending):
# Remove newlines from the sequence and check the ending
sequence = "".join(matches[0].splitlines())

return sequence.endswith(ending.upper())


# TEMPORARY functions for the web scraper; they will be removed soon

def execute_bash_command(command: str) -> str:
"""
command: str # command to run in bash, for example install software inside micromamba environment
"""
mounts = make_mounts()
try:
with MicromambaSession(image="ghcr.io/longevity-genie/just-agents/websandbox:main",
lang="python",
keep_template=True,
verbose=True,
mounts=mounts) as session:
result = session.execute_command(command=command)
ugly_log(command, output_dir, "bash", "sh")
return result
except Exception as e:
print(f"Error executing bash command: {e}")
return str(e)


def execute_python_code(code: str) -> str:
"""
code: str # python code to run in micromamba environment
"""
mounts = make_mounts()
try:
with MicromambaSession(image="ghcr.io/longevity-genie/just-agents/websandbox:main",
lang="python",
keep_template=True,
verbose=True,
mounts=mounts) as session:
result = session.run(code)
ugly_log(code, output_dir, "code", "py")
return result
except Exception as e:
print(f"Error executing Python code: {e}")
return str(e)
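Both helpers open a MicromambaSession against the websandbox image built by this commit, execute the payload, log it via ugly_log, and return either the sandbox output or the stringified exception. A hedged usage sketch (the command and code passed in are illustrative):

```python
from examples.coding.tools import execute_bash_command, execute_python_code

# Install an extra package inside the preactivated base environment (illustrative).
print(execute_bash_command("pip install lxml"))

# Run a small scrape inside the sandbox; /output is a mounted volume (illustrative).
print(execute_python_code(
    "import requests\n"
    "html = requests.get('https://example.com', timeout=30).text\n"
    "open('/output/example.html', 'w').write(html)\n"
))
```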
28 changes: 28 additions & 0 deletions examples/coding/webscrapper.py
@@ -0,0 +1,28 @@
from pathlib import Path

from dotenv import load_dotenv
from llm_sandbox.micromamba import MicromambaSession
from llm_sandbox.docker import SandboxDockerSession

from examples.coding.mounts import make_mounts, input_dir, output_dir, coding_examples_dir
from just_agents.interfaces.IAgent import build_agent, IAgent
from just_agents.llm_session import LLMSession
from examples.coding.tools import write_thoughts_and_results, amino_match_endswith

load_dotenv(override=True)

"""
This example shows how to use a simple code agent to run Python code and bash commands. It does not use volumes and is based on the basic LLMSession class.
"""

if __name__ == "__main__":
assistant: LLMSession = build_agent(coding_examples_dir / "webscrapper.yaml")  # probably should make a separate web-scraping agent
query = """
Here is a list of events from Zelar: https://app.sola.day/event/zelarcity
Investigate the layout of the page and find all the events.
Scrape them and save the event information to /output/zelar_events.txt.
If you get zero events, try to find a way to navigate to subpages and scrape them.
"""
result, thoughts = assistant.query(query)
write_thoughts_and_results("zelar_events", thoughts, result)
118 changes: 118 additions & 0 deletions examples/coding/webscrapper.yaml
@@ -0,0 +1,118 @@
class: "ChainOfThoughtAgent"
system_prompt: "You are a web scraping AI assistant.
Your role is to help with web scraping tasks and generate plans or code as needed.
Please adhere to the following guidelines strictly:
1. Always maintain your role as a web scraping specialist.
2. You are working on an Ubuntu 24.04 system with a base micromamba environment defined by the following environment.yaml file:
```yaml
name: base
channels:
- conda-forge
- defaults
dependencies:
- python=3.11
- requests
- beautifulsoup4
- selenium
- playwright
- scrapy
- pandas>=2.2.2
- numpy<2.0.0,>=1.23
- pyarrow
- pip:
- fake-useragent
- cloudscraper
```
However, no other software is installed by default.
3. Use the execute_bash_command tool to install new dependencies. You do not need to activate the base micromamba environment; it is already preactivated when you run commands.
4. Use the execute_python_code tool to run Python code. The code will be run in the base micromamba environment.
5. Use information provided in the input to write detailed plans or code to accomplish the given scraping task.
6. When scraping:
- Always implement retry logic with exponential backoff
- Rotate user agents to avoid detection
- Check if the resulting data is empty or incomplete
- If a scraping attempt fails, try alternative methods in this order:
a. Basic requests with headers
b. Cloudscraper for cloudflare bypass
c. Selenium/Playwright for JavaScript rendering
- Implement proper error handling and logging
- Respect robots.txt and implement rate limiting
- Save raw HTML responses before parsing in case of errors
- Validate scraped data structure and completeness
- If data is empty or invalid, try different selectors or XPaths
7. When writing code:
- Use full absolute paths for all files
- Install dependencies using micromamba (with the -y flag) or pip
- Always give all relevant imports at the beginning
- Save scraped data in the /output directory
- Implement proper logging to /output/logs
- Add comments explaining the scraping strategy
- Include error handling for network issues
- Validate output data before saving
8. If execution errors occur:
- Check network connectivity
- Verify selectors/XPaths are still valid
- Try alternative scraping methods
- Inspect any anti-bot protection
- Log failed attempts and errors
- Implement circuit breaker pattern for repeated failures
9. Pay attention to the number of input files and do not miss any.
10. Do not create or activate the micromamba environment 'base'; it is already activated by default.
11. Be aware of file name changes or outputs from previous steps when provided with history.
12. If execution errors occur, fix the code based on the error information provided.
13. When you are ready to give the final answer, explain the results obtained and files and folders created in the /output (if any).
14. Example of a robust request with a rotating user agent:
```python
import requests
from fake_useragent import UserAgent
response = requests.get(url, headers={'User-Agent': UserAgent().random}, timeout=30)
```
System constraints:
- You are working on an Ubuntu 24.04 system.
- You have a micromamba environment named 'base'.
- No other software is installed by default.
Remember to adapt your response based on whether you're creating an initial plan or writing code for a specific task.
Your goal is to provide accurate, efficient, and executable web scraping solutions.
For each step, provide a title that describes what you're doing in that step, along with the content.
Decide if you need another step or if you're ready to give the final answer.
Respond in JSON format with 'title', 'code', 'content', and 'next_action' (either 'continue' or 'final_answer') keys.
Make sure you send only one JSON step object.
USE AS MANY REASONING STEPS AS POSSIBLE. AT LEAST 3.
BE AWARE OF YOUR LIMITATIONS AS AN LLM AND WHAT YOU CAN AND CANNOT DO.
IN YOUR REASONING, INCLUDE EXPLORATION OF ALTERNATIVE ANSWERS.
CONSIDER YOU MAY BE WRONG, AND IF YOU ARE WRONG IN YOUR REASONING, WHERE IT WOULD BE.
FULLY TEST ALL OTHER POSSIBILITIES.
YOU CAN BE WRONG. WHEN YOU SAY YOU ARE RE-EXAMINING, ACTUALLY RE-EXAMINE, AND USE ANOTHER APPROACH TO DO SO.
DO NOT JUST SAY YOU ARE RE-EXAMINING. USE AT LEAST 3 METHODS TO DERIVE THE ANSWER. USE BEST PRACTICES.
Example of a valid JSON response:
```json
{
    "title": "Identifying Key Information",
    "content": "To begin solving this problem, we need to carefully examine the given information and identify the crucial elements that will guide our solution process. This involves...",
    "next_action": "continue"
}
```
"
system_prompt_path:
final_prompt: "Please provide the final answer based solely on your reasoning above."
title: "title"
content: "content"
next_action: "next_action"
action_continue: "continue"
action_final: "final_answer"
thought_max_tokes: 500
max_steps: 25
final_max_tokens: 1500
tools:
- package: "examples.coding.tools"
function: "execute_bash_command"
- package: "examples.coding.tools"
function: "execute_python_code"
options:
model: "gpt-4o-mini"
temperature: 0.0
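Guideline 6 above asks the agent for retry logic with exponential backoff and user-agent rotation. A minimal sketch of that strategy, using only packages from the websandbox env.yaml (the function name and parameters are illustrative):

```python
import time

import requests
from fake_useragent import UserAgent

def fetch_with_retries(url: str, max_attempts: int = 4) -> str:
    """Fetch a page with exponential backoff and a rotated user agent."""
    ua = UserAgent()
    for attempt in range(max_attempts):
        try:
            response = requests.get(url, headers={"User-Agent": ua.random}, timeout=30)
            response.raise_for_status()
            if response.text:  # guard against empty responses
                return response.text
        except requests.RequestException as error:
            print(f"Attempt {attempt + 1} failed: {error}")
        if attempt < max_attempts - 1:
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s
    raise RuntimeError(f"All {max_attempts} attempts failed for {url}")
```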
2 changes: 1 addition & 1 deletion examples/multiagent/glucose_dao_reflection.py
@@ -25,7 +25,7 @@ def planning_example():

planner: ChatAgent = ChatAgent(llm_options = llm_options.OPENAI_GPT4oMINI, role = "You are a helpful adviser that helps people to create their NGOs and DAOs",
goal = "Your goal is to help the user to make the best possible action plan to create her NGO or DAO.",
task="Create the best actionable plan possible, take into account the feedback and suggestions, improve it until it is perfect.")
task="Create the best actionable plan possible while being realististic knowing limitations of the time and resources of the founder and current state of the art, take into account the feedback and suggestions, improve it until it is perfect.",)
reviewer: ChatAgent = ChatAgent(llm_options = llm_options.OPENAI_GPT4oMINI,
role = "you represent the interests of the DAO or nonprofit creator and provide feedback and suggestions for the plan which is generated for you",
goal="provide the best feedback ever and ask for specific improvements",
4 changes: 2 additions & 2 deletions just_agents_coding/containers/biosandbox/Dockerfile
@@ -5,9 +5,9 @@ RUN apt update && \
apt upgrade -y && \
apt install -y tar gzip libz-dev software-properties-common python3-software-properties automake nano cmake zip wget gcc git build-essential curl gosu libbz2-dev zlib1g-dev gawk libxml2-dev

RUN mkdir /input && \
RUN mkdir -p /input && \
chown -R $MAMBA_USER:$MAMBA_USER /input
RUN mkdir /output && \
RUN mkdir -p /output && \
chown -R $MAMBA_USER:$MAMBA_USER /output

USER $MAMBA_USER
6 changes: 3 additions & 3 deletions just_agents_coding/containers/sandbox/Dockerfile
@@ -5,12 +5,12 @@ RUN apt update && \
apt upgrade -y && \
apt install -y tar gzip libz-dev software-properties-common python3-software-properties automake nano cmake zip wget gcc git build-essential curl gosu libbz2-dev zlib1g-dev gawk libxml2-dev

RUN mkdir /input && \
RUN mkdir -p /input && \
chown -R $MAMBA_USER:$MAMBA_USER /input
RUN mkdir /output && \
RUN mkdir -p /output && \
chown -R $MAMBA_USER:$MAMBA_USER /output

USER $MAMBA_USER
COPY --chown=$MAMBA_USER:$MAMBA_USER env.yaml /tmp/env.yaml
RUN micromamba install -y -n base -f /tmp/env.yaml && \
micromamba clean --all --yes
16 changes: 16 additions & 0 deletions just_agents_coding/containers/websandbox/Dockerfile
@@ -0,0 +1,16 @@
FROM mambaorg/micromamba:latest

USER root
RUN apt update && \
apt upgrade -y && \
apt install -y tar gzip libz-dev software-properties-common python3-software-properties automake nano cmake zip wget gcc git build-essential curl gosu libbz2-dev zlib1g-dev gawk libxml2-dev

RUN mkdir -p /input && \
chown -R $MAMBA_USER:$MAMBA_USER /input
RUN mkdir -p /output && \
chown -R $MAMBA_USER:$MAMBA_USER /output

USER $MAMBA_USER
COPY --chown=$MAMBA_USER:$MAMBA_USER env.yaml /tmp/env.yaml
RUN micromamba install -y -n base -f /tmp/env.yaml && \
micromamba clean --all --yes
2 changes: 2 additions & 0 deletions just_agents_coding/containers/websandbox/build.sh
@@ -0,0 +1,2 @@
#!/bin/bash
docker build -t ghcr.io/longevity-genie/just-agents/websandbox:main .
17 changes: 17 additions & 0 deletions just_agents_coding/containers/websandbox/env.yaml
@@ -0,0 +1,17 @@
name: base
channels:
- conda-forge
- defaults
dependencies:
- python=3.11
- requests
- beautifulsoup4
- selenium
- playwright
- scrapy
- pandas>=2.2.2
- numpy<2.0.0,>=1.23
- pyarrow
- pip:
- fake-useragent
- cloudscraper
