added webscrapper
antonkulaga committed Nov 5, 2024
1 parent 39eaf1d commit c60af54
Showing 11 changed files with 272 additions and 7 deletions.
43 changes: 43 additions & 0 deletions .github/workflows/build-websandbox
@@ -0,0 +1,43 @@
name: build and push websandbox

on:
push:
branches: [ main ]
paths:
- 'just_agents_coding/containers/websandbox/**'

env:
REGISTRY: ghcr.io
IMAGE_NAME: ${{ github.repository }}/websandbox

jobs:
build-and-push:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write

steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Log in to the Container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Extract metadata (tags, labels) for Docker
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}

- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
context: ./just_agents_coding/containers/websandbox
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
1 change: 1 addition & 0 deletions examples/coding/bioinformatic_agent.py
@@ -13,6 +13,7 @@
"""
This example shows how to use a Chain Of Thought code agent to run Python code and bash commands.
It uses volumes (see tools.py) and is based on the ChainOfThoughtAgent class.
Note: this example is a work in progress; the task is currently too complex to be solved in one go.
"""

if __name__ == "__main__":
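For reference, the driving pattern for such a Chain Of Thought agent mirrors the web scraper example added in this commit: build the agent from a YAML profile, send a query, and persist both the answer and the reasoning trace. A minimal, hypothetical sketch (the profile filename and query are illustrative, not taken from this diff):

```python
from examples.coding.mounts import coding_examples_dir
from examples.coding.tools import write_thoughts_and_results
from just_agents.interfaces.IAgent import build_agent

# Build the agent from its YAML profile (filename is illustrative).
agent = build_agent(coding_examples_dir / "bioinformatic_agent.yaml")
# query() returns the final answer plus the intermediate reasoning steps.
result, thoughts = agent.query("Download GSE176043 with GEOparse and summarize the dataset")
write_thoughts_and_results("bioinformatic_example", thoughts, result)
```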
42 changes: 41 additions & 1 deletion examples/coding/tools.py
@@ -137,4 +137,44 @@ def amino_match_endswith(text, ending):
# Remove newlines from the sequence and check the ending
sequence = "".join(matches[0].splitlines())

return sequence.endswith(ending.upper())


# TEMPORARY functions for the web scraper; they will be removed soon

def execute_bash_command(command: str) -> str:
"""
command: str # command to run in bash, for example install software inside micromamba environment
"""
mounts = make_mounts()
try:
with MicromambaSession(image="ghcr.io/longevity-genie/just-agents/websandbox:main",
lang="python",
keep_template=True,
verbose=True,
mounts=mounts) as session:
result = session.execute_command(command=command)
ugly_log(command, output_dir, "bash", "sh")
return result
except Exception as e:
print(f"Error executing bash command: {e}")
return str(e)


def execute_python_code(code: str) -> str:
"""
code: str # python code to run in micromamba environment
"""
mounts = make_mounts()
try:
with MicromambaSession(image="ghcr.io/longevity-genie/just-agents/websandbox:main",
lang="python",
keep_template=True,
verbose=True,
mounts=mounts) as session:
result = session.run(code)
ugly_log(code, output_dir, "code", "py")
return result
except Exception as e:
print(f"Error executing Python code: {e}")
return str(e)
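Both helpers open a MicromambaSession against the websandbox image built by this commit, execute the payload, log it via ugly_log, and return either the sandbox output or the stringified exception. A hedged usage sketch (the command and code passed in are illustrative):

```python
from examples.coding.tools import execute_bash_command, execute_python_code

# Install an extra package inside the preactivated base environment (illustrative).
print(execute_bash_command("pip install lxml"))

# Run a small scrape inside the sandbox; /output is a mounted volume (illustrative).
print(execute_python_code(
    "import requests\n"
    "html = requests.get('https://example.com', timeout=30).text\n"
    "open('/output/example.html', 'w').write(html)\n"
))
```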
28 changes: 28 additions & 0 deletions examples/coding/webscrapper.py
@@ -0,0 +1,28 @@
from pathlib import Path

from dotenv import load_dotenv
from llm_sandbox.micromamba import MicromambaSession
from llm_sandbox.docker import SandboxDockerSession

from examples.coding.mounts import make_mounts, input_dir, output_dir, coding_examples_dir
from just_agents.interfaces.IAgent import build_agent, IAgent
from just_agents.llm_session import LLMSession
from examples.coding.tools import write_thoughts_and_results, amino_match_endswith

load_dotenv(override=True)

"""
This example shows how to use a simple code agent to run Python code and bash commands. It does not use volumes and is based on the basic LLMSession class.
"""

if __name__ == "__main__":
assistant: LLMSession = build_agent(coding_examples_dir / "webscrapper.yaml")  # probably should make a separate web-scraping agent
query = """
Here is a list of events from Zelar: https://app.sola.day/event/zelarcity
Investigate the layout of the page and find all the events.
Scrape them and save the event information to /output/zelar_events.txt.
If you get zero events, try to find a way to navigate to subpages and scrape them.
"""
result, thoughts = assistant.query(query)
write_thoughts_and_results("zelar_events", thoughts, result)
118 changes: 118 additions & 0 deletions examples/coding/webscrapper.yaml
@@ -0,0 +1,118 @@
class: "ChainOfThoughtAgent"
system_prompt: "You are a web scraping AI assistant.
Your role is to help with web scraping tasks and generate plans or code as needed.
Please adhere to the following guidelines strictly:
1. Always maintain your role as a web scraping specialist.
2. You are working on an Ubuntu 24.04 system with a base micromamba environment defined by the following environment.yaml file:
```yaml
name: base
channels:
- conda-forge
- defaults
dependencies:
- python=3.11
- requests
- beautifulsoup4
- selenium
- playwright
- scrapy
- pandas>=2.2.2
- numpy<2.0.0,>=1.23
- pyarrow
- pip:
- fake-useragent
- cloudscraper
```
However, no other software is installed by default.
3. Use the execute_bash_command tool to install new dependencies. You do not need to activate the base micromamba environment; it is already preactivated when you run commands.
4. Use the execute_python_code tool to run Python code. The code will be run in the base micromamba environment.
5. Use information provided in the input to write detailed plans or code to accomplish the given scraping task.
6. When scraping:
- Always implement retry logic with exponential backoff
- Rotate user agents to avoid detection
- Check if the resulting data is empty or incomplete
- If a scraping attempt fails, try alternative methods in this order:
a. Basic requests with headers
b. Cloudscraper for cloudflare bypass
c. Selenium/Playwright for JavaScript rendering
- Implement proper error handling and logging
- Respect robots.txt and implement rate limiting
- Save raw HTML responses before parsing in case of errors
- Validate scraped data structure and completeness
- If data is empty or invalid, try different selectors or XPaths
7. When writing code:
- Use full absolute paths for all files
- Install dependencies using micromamba (with the -y flag) or pip
- Always give all relevant imports at the beginning
- Save scraped data in the /output directory
- Implement proper logging to /output/logs
- Add comments explaining the scraping strategy
- Include error handling for network issues
- Validate output data before saving
8. If execution errors occur:
- Check network connectivity
- Verify selectors/XPaths are still valid
- Try alternative scraping methods
- Inspect any anti-bot protection
- Log failed attempts and errors
- Implement circuit breaker pattern for repeated failures
9. Pay attention to the number of input files and do not miss any.
10. Do not create or activate the micromamba environment 'base'; it is already activated by default.
11. Be aware of file name changes or outputs from previous steps when provided with history.
12. If execution errors occur, fix the code based on the error information provided.
13. When you are ready to give the final answer, explain the results obtained and files and folders created in the /output (if any).
14. Example of a robust request with a rotating user agent:
```python
import requests
from fake_useragent import UserAgent
response = requests.get(url, headers={'User-Agent': UserAgent().random}, timeout=30)
```
System constraints:
- You are working on an Ubuntu 24.04 system.
- You have a micromamba environment named 'base'.
- No other software is installed by default.
Remember to adapt your response based on whether you're creating an initial plan or writing code for a specific task.
Your goal is to provide accurate, efficient, and executable web scraping solutions.
For each step, provide a title that describes what you're doing in that step, along with the content.
Decide if you need another step or if you're ready to give the final answer.
Respond in JSON format with 'title', 'code', 'content', and 'next_action' (either 'continue' or 'final_answer') keys.
Make sure you send only one JSON step object.
USE AS MANY REASONING STEPS AS POSSIBLE. AT LEAST 3.
BE AWARE OF YOUR LIMITATIONS AS AN LLM AND WHAT YOU CAN AND CANNOT DO.
IN YOUR REASONING, INCLUDE EXPLORATION OF ALTERNATIVE ANSWERS.
CONSIDER YOU MAY BE WRONG, AND IF YOU ARE WRONG IN YOUR REASONING, WHERE IT WOULD BE.
FULLY TEST ALL OTHER POSSIBILITIES.
YOU CAN BE WRONG. WHEN YOU SAY YOU ARE RE-EXAMINING, ACTUALLY RE-EXAMINE, AND USE ANOTHER APPROACH TO DO SO.
DO NOT JUST SAY YOU ARE RE-EXAMINING. USE AT LEAST 3 METHODS TO DERIVE THE ANSWER. USE BEST PRACTICES.
Example of a valid JSON response:
```json
{
    "title": "Identifying Key Information",
    "content": "To begin solving this problem, we need to carefully examine the given information and identify the crucial elements that will guide our solution process. This involves...",
    "next_action": "continue"
}
```
"
system_prompt_path:
final_prompt: "Please provide the final answer based solely on your reasoning above."
title: "title"
content: "content"
next_action: "next_action"
action_continue: "continue"
action_final: "final_answer"
thought_max_tokes: 500
max_steps: 25
final_max_tokens: 1500
tools:
- package: "examples.coding.tools"
function: "execute_bash_command"
- package: "examples.coding.tools"
function: "execute_python_code"
options:
model: "gpt-4o-mini"
temperature: 0.0
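Guideline 6 above asks the agent for retry logic with exponential backoff and user-agent rotation. A minimal sketch of that strategy, using only packages from the websandbox env.yaml (the function name and parameters are illustrative):

```python
import time

import requests
from fake_useragent import UserAgent

def fetch_with_retries(url: str, max_attempts: int = 4) -> str:
    """Fetch a page with exponential backoff and a rotated user agent."""
    ua = UserAgent()
    for attempt in range(max_attempts):
        try:
            response = requests.get(url, headers={"User-Agent": ua.random}, timeout=30)
            response.raise_for_status()
            if response.text:  # guard against empty responses
                return response.text
        except requests.RequestException as error:
            print(f"Attempt {attempt + 1} failed: {error}")
        if attempt < max_attempts - 1:
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s
    raise RuntimeError(f"All {max_attempts} attempts failed for {url}")
```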
2 changes: 1 addition & 1 deletion examples/multiagent/glucose_dao_reflection.py
@@ -25,7 +25,7 @@ def planning_example():

planner: ChatAgent = ChatAgent(llm_options = llm_options.OPENAI_GPT4oMINI, role = "You are a helpful adviser that helps people to create their NGOs and DAOs",
goal = "Your goal is to help the user to make the best possible action plan to create her NGO or DAO.",
task="Create the best actionable plan possible, take into account the feedback and suggestions, improve it until it is perfect.")
task="Create the best actionable plan possible while being realististic knowing limitations of the time and resources of the founder and current state of the art, take into account the feedback and suggestions, improve it until it is perfect.",)
reviewer: ChatAgent = ChatAgent(llm_options = llm_options.OPENAI_GPT4oMINI,
role = "you represent the interests of the DAO or nonprofit creator and provide feedback and suggestions for the plan which is generated for you",
goal="provide the best feedback ever and ask for specific improvements",
4 changes: 2 additions & 2 deletions just_agents_coding/containers/biosandbox/Dockerfile
@@ -5,9 +5,9 @@ RUN apt update && \
apt upgrade -y && \
apt install -y tar gzip libz-dev software-properties-common python3-software-properties automake nano cmake zip wget gcc git build-essential curl gosu libbz2-dev zlib1g-dev gawk libxml2-dev

RUN mkdir /input && \
RUN mkdir -p /input && \
chown -R $MAMBA_USER:$MAMBA_USER /input
RUN mkdir /output && \
RUN mkdir -p /output && \
chown -R $MAMBA_USER:$MAMBA_USER /output

USER $MAMBA_USER
6 changes: 3 additions & 3 deletions just_agents_coding/containers/sandbox/Dockerfile
@@ -5,12 +5,12 @@ RUN apt update && \
apt upgrade -y && \
apt install -y tar gzip libz-dev software-properties-common python3-software-properties automake nano cmake zip wget gcc git build-essential curl gosu libbz2-dev zlib1g-dev gawk libxml2-dev

RUN mkdir /input && \
RUN mkdir -p /input && \
chown -R $MAMBA_USER:$MAMBA_USER /input
RUN mkdir /output && \
RUN mkdir -p /output && \
chown -R $MAMBA_USER:$MAMBA_USER /output

USER $MAMBA_USER
COPY --chown=$MAMBA_USER:$MAMBA_USER env.yaml /tmp/env.yaml
RUN micromamba install -y -n base -f /tmp/env.yaml && \
micromamba clean --all --yes
16 changes: 16 additions & 0 deletions just_agents_coding/containers/websandbox/Dockerfile
@@ -0,0 +1,16 @@
FROM mambaorg/micromamba:latest

USER root
RUN apt update && \
apt upgrade -y && \
apt install -y tar gzip libz-dev software-properties-common python3-software-properties automake nano cmake zip wget gcc git build-essential curl gosu libbz2-dev zlib1g-dev gawk libxml2-dev

RUN mkdir -p /input && \
chown -R $MAMBA_USER:$MAMBA_USER /input
RUN mkdir -p /output && \
chown -R $MAMBA_USER:$MAMBA_USER /output

USER $MAMBA_USER
COPY --chown=$MAMBA_USER:$MAMBA_USER env.yaml /tmp/env.yaml
RUN micromamba install -y -n base -f /tmp/env.yaml && \
micromamba clean --all --yes
2 changes: 2 additions & 0 deletions just_agents_coding/containers/websandbox/build.sh
@@ -0,0 +1,2 @@
#!/bin/bash
docker build -t ghcr.io/longevity-genie/just-agents/websandbox:main .
17 changes: 17 additions & 0 deletions just_agents_coding/containers/websandbox/env.yaml
@@ -0,0 +1,17 @@
name: base
channels:
- conda-forge
- defaults
dependencies:
- python=3.11
- requests
- beautifulsoup4
- selenium
- playwright
- scrapy
- pandas>=2.2.2
- numpy<2.0.0,>=1.23
- pyarrow
- pip:
- fake-useragent
- cloudscraper
