Merge feature/pegasus-scraping-enhancements
Sunwood-ai-labs committed Jun 9, 2024
2 parents 56e837d + 2a08419 commit 560914f
Showing 7 changed files with 210 additions and 91 deletions.
6 changes: 5 additions & 1 deletion .SourceSageignore
@@ -35,4 +35,8 @@ tests
template
aira.egg-info
aira.Gaiah.md
README_template.md
README_template.md
output
.harmon_ai
pegasus_surf.egg-info
.aira
3 changes: 2 additions & 1 deletion .gitignore
@@ -167,4 +167,5 @@ tmp2.md
.SourceSageAssets
.aira/aira.Gaiah.md
.harmon_ai/README_template.md
output
output
urls.txt
15 changes: 13 additions & 2 deletions README.md
@@ -42,7 +42,7 @@ pegasus は、ウェブサイトを再帰的にクロールし、そのコンテ
Install pegasus using pip.

```shell
pip install pegasus
pip install pegasus-surf
```

## Usage
@@ -53,7 +53,18 @@ pegasus をコマンドラインから使用するには、以下のようなコ

```shell
pegasus https://example.com/start-page output_directory --exclude-selectors header footer nav --include-domain example.com --exclude-keywords login --output-extension txt
pegasus https://docs.eraser.io/docs/what-is-eraser output/eraser_docs --exclude-selectors header footer nav aside .sidebar .header .footer .navigation .breadcrumbs --include-domain docs.eraser.io --exclude-keywords login --output-extension .txt

pegasus --base-url https://docs.eraser.io/docs/what-is-eraser output/eraser_docs --exclude-selectors header footer nav aside .sidebar .header .footer .navigation .breadcrumbs --include-domain docs.eraser.io --exclude-keywords login --output-extension .txt

# Run with a maximum crawl depth
pegasus --base-url https://docs.eraser.io/docs/what-is-eraser output/eraser_docs2 --exclude-selectors header footer nav aside .sidebar .header .footer .navigation .breadcrumbs --include-domain docs.eraser.io --exclude-keywords login --output-extension .txt --max-depth 2

# Run against a text file listing the URLs to scrape
pegasus --url-file urls.txt output/roomba --exclude-selectors header footer nav aside .sidebar .header .footer .navigation .breadcrumbs --exclude-keywords login --output-extension .txt --max-depth 1

# Classify pages with an LLM
pegasus --url-file urls.txt output/roomba2 --exclude-selectors header footer nav aside .sidebar .header .footer .navigation .breadcrumbs --exclude-keywords login --output-extension .txt --max-depth 1 --system-message "あなたは、与えられたウェブサイトのコンテンツが特定のトピックに関連する有用な情報を含んでいるかどうかを判断するアシスタントです。トピックに関連する有益な情報が含まれている場合は「True」、そうでない場合は「False」と回答してください。" --classification-prompt "次のウェブサイトのコンテンツは、Roomba APIやiRobotに関する有益な情報を提供していますか? 提供している場合は「True」、そうでない場合は「False」と回答してください。"

```

- `https://example.com/start-page`: Specifies the base URL at which to start crawling.
224 changes: 149 additions & 75 deletions pegasus/Pegasus.py
@@ -1,87 +1,161 @@
# pegasus/pegasus.py
import requests
import markdownify
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import re
import loguru
import time
from art import *
from litellm import completion
from tqdm import tqdm
import litellm
# litellm.set_verbose=True

logger = loguru.logger

class Pegasus:
def __init__(self, base_url, output_dir, exclude_selectors=None, include_domain=None, exclude_keywords=None, output_extension=".md", dust_size=1000):
self.base_url = base_url
self.output_dir = output_dir
self.exclude_selectors = exclude_selectors
self.include_domain = include_domain
self.exclude_keywords = exclude_keywords
self.visited_urls = set()
self.output_extension = output_extension
self.dust_size = dust_size
tprint(" Pegasus ", font="rnd-xlarge")
logger.info("初期化パラメータ:")
logger.info(f" base_url: {base_url}")
logger.info(f" output_dir: {output_dir}")
logger.info(f" exclude_selectors: {exclude_selectors}")
logger.info(f" include_domain: {include_domain}")
logger.info(f" exclude_keywords: {exclude_keywords}")
logger.info(f" output_extension: {output_extension}")
logger.info(f" dust_size: {dust_size}")

def download_and_convert(self, url):
os.makedirs(self.output_dir, exist_ok=True)
if url in self.visited_urls:
return
self.visited_urls.add(url)

try:
response = requests.get(url)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

if self.exclude_selectors:
for selector in self.exclude_selectors:
for element in soup.select(selector):
element.decompose()

markdown_content = markdownify.markdownify(str(soup))
markdown_content = re.sub(r'\n{5,}', '\n\n\n\n', markdown_content)

parsed_url = urlparse(url)
output_file = f"{self.output_dir}/{parsed_url.path.replace('/', '_')}{self.output_extension}"

if len(markdown_content) < self.dust_size:
dust_dir = os.path.join(self.output_dir, "dust")
os.makedirs(dust_dir, exist_ok=True)
output_file = f"{dust_dir}/{parsed_url.path.replace('/', '_')}{self.output_extension}"

with open(output_file, 'w', encoding='utf-8') as file:
file.write(markdown_content)

logger.info(f"変換成功: {url} ---> {output_file} [{len(markdown_content)/1000}kb]")

soup_url = BeautifulSoup(response.text, 'html.parser')

for link in soup_url.find_all('a'):
href = link.get('href')
if href:
absolute_url = urljoin(url, href)
if self.include_domain and self.include_domain in absolute_url:
if self.exclude_keywords:
if any(keyword in absolute_url for keyword in self.exclude_keywords):
continue
absolute_url = absolute_url.split('#')[0]
self.download_and_convert(absolute_url)

except requests.exceptions.RequestException as e:
logger.error(f"ダウンロードエラー: {url}: {e}")
except IOError as e:
logger.error(f"書き込みエラー: {output_file}: {e}")

def run(self):
logger.info(f"スクレイピング開始: base_url={self.base_url}")
self.download_and_convert(self.base_url)
logger.info("スクレイピング完了")
def __init__(self, output_dir, exclude_selectors=None, include_domain=None, exclude_keywords=None, output_extension=".md", dust_size=1000, max_depth=None, system_message=None, classification_prompt=None, max_retries=3):
self.output_dir = output_dir
self.exclude_selectors = exclude_selectors
self.include_domain = include_domain
self.exclude_keywords = exclude_keywords
self.visited_urls = set()
self.output_extension = output_extension
self.dust_size = dust_size
self.max_depth = max_depth
self.domain_summaries = {}
self.system_message = system_message
self.classification_prompt = classification_prompt
self.max_retries = max_retries
tprint(" Pegasus ", font="rnd-xlarge")
logger.info("初期化パラメータ:")
logger.info(f" output_dir: {output_dir}")
logger.info(f" exclude_selectors: {exclude_selectors}")
logger.info(f" include_domain: {include_domain}")
logger.info(f" exclude_keywords: {exclude_keywords}")
logger.info(f" output_extension: {output_extension}")
logger.info(f" dust_size: {dust_size}")
logger.info(f" max_depth: {max_depth}")
logger.info(f" system_message: {system_message}")
logger.info(f" classification_prompt: {classification_prompt}")
logger.info(f" max_retries: {max_retries}")

def filter_site(self, markdown_content):
if(self.classification_prompt is None):
return True

retry_count = 0
while retry_count < self.max_retries:
try:
messages = [
{"role": "system", "content": self.system_message},
{"role": "user", "content": f"{self.classification_prompt}\n\n{markdown_content}"}
]
response = completion(
model="gemini/gemini-1.5-pro-latest",
messages=messages
)
content = response.get('choices', [{}])[0].get('message', {}).get('content')
logger.debug(f"content : {content}")
if "true" in content.lower():
return True
elif "false" in content.lower():
return False
else:
raise ValueError("分類結果が曖昧です。")
except Exception as e:
retry_count += 1
logger.warning(f"フィルタリングでエラーが発生しました。リトライします。({retry_count}/{self.max_retries}\nError: {e}")

if "429" in str(e):
sleep_time = 60 # 60秒スリープ
else:
sleep_time = 10 # その他のエラーの場合は10秒スリープ

for _ in tqdm(range(sleep_time), desc="Sleeping", unit="s"):
time.sleep(1)

logger.error(f"フィルタリングに失敗しました。リトライ回数の上限に達しました。({self.max_retries}回)")
return True

def download_and_convert(self, url, depth=0):
if url in self.visited_urls:
return
self.visited_urls.add(url)

try:
response = requests.get(url)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')

if self.exclude_selectors:
for selector in self.exclude_selectors:
for element in soup.select(selector):
element.decompose()

markdown_content = markdownify.markdownify(str(soup))
markdown_content = re.sub(r'\n{5,}', '\n\n\n\n', markdown_content)

if not self.filter_site(markdown_content):
parsed_url = urlparse(url)
domain = parsed_url.netloc
domain_dir = os.path.join(self.output_dir, domain)
os.makedirs(domain_dir, exist_ok=True)
excluded_dir = os.path.join(domain_dir, "excluded")
os.makedirs(excluded_dir, exist_ok=True)
output_file = f"{excluded_dir}/{parsed_url.path.replace('/', '_')}{self.output_extension}"
else:
parsed_url = urlparse(url)
domain = parsed_url.netloc
domain_dir = os.path.join(self.output_dir, domain)
os.makedirs(domain_dir, exist_ok=True)

output_file = f"{domain_dir}/{parsed_url.path.replace('/', '_')}{self.output_extension}"

if len(markdown_content) < self.dust_size:
dust_dir = os.path.join(domain_dir, "dust")
os.makedirs(dust_dir, exist_ok=True)
output_file = f"{dust_dir}/{parsed_url.path.replace('/', '_')}{self.output_extension}"

with open(output_file, 'w', encoding='utf-8') as file:
file.write(markdown_content)

logger.info(f"[{depth}]変換成功: {url} ---> {output_file} [{len(markdown_content)/1000}kb]")

if domain not in self.domain_summaries:
self.domain_summaries[domain] = []
self.domain_summaries[domain].append(f"# {os.path.basename(output_file)}\n\n---\n\n{markdown_content}")

if self.max_depth is None or depth < self.max_depth:
soup_url = BeautifulSoup(response.text, 'html.parser')

for link in soup_url.find_all('a'):
href = link.get('href')
if href:
absolute_url = urljoin(url, href)
if (self.include_domain and self.include_domain in absolute_url) or (self.include_domain == ""):
if self.exclude_keywords:
if any(keyword in absolute_url for keyword in self.exclude_keywords):
continue
absolute_url = absolute_url.split('#')[0]
self.download_and_convert(absolute_url, depth + 1)

except requests.exceptions.RequestException as e:
logger.error(f"ダウンロードエラー: {url}: {e}")
except IOError as e:
logger.error(f"書き込みエラー: {output_file}: {e}")

def create_domain_summaries(self):
for domain, summaries in self.domain_summaries.items():
summary_file = os.path.join(self.output_dir, f"{domain}_summary{self.output_extension}")
with open(summary_file, 'w', encoding='utf-8') as file:
file.write('\n\n'.join(summaries))
logger.info(f"サマリーファイル作成: {summary_file}")

def run(self, base_url):
logger.info(f"スクレイピング開始: base_url={base_url}")
self.download_and_convert(base_url)
self.create_domain_summaries()
logger.info("スクレイピング完了")
40 changes: 29 additions & 11 deletions pegasus/cli.py
@@ -1,29 +1,47 @@
# pegasus/cli.py
import argparse
from .Pegasus import Pegasus
from dotenv import load_dotenv
load_dotenv(verbose=True)

def main():
parser = argparse.ArgumentParser(description='Pegasus')
parser.add_argument('base_url', help='Base URL to start scraping')
parser.add_argument('output_dir', help='Output directory for markdown files')
parser.add_argument('--exclude-selectors', nargs='+', help='CSS selectors to exclude')
parser.add_argument('--include-domain', help='Domain to include in URL matching')
parser.add_argument('--exclude-keywords', nargs='+', help='Keywords to exclude in URL matching')
parser.add_argument('--output-extension', default='.md', help='Output file extension (default: .md)')
parser.add_argument('--dust-size', type=int, default=1000, help='File size threshold for moving to dust folder (default: 1000 bytes)')
parser.add_argument('--base-url', help='スクレイピングを開始するベースURL')
parser.add_argument('--url-file', help='スクレイピングするURLが記載されたテキストファイル')
parser.add_argument('output_dir', help='Markdownファイルの出力ディレクトリ')
parser.add_argument('--exclude-selectors', nargs='+', help='除外するCSSセレクター')
parser.add_argument('--include-domain', default='', help='URLマッチングに含めるドメイン')
parser.add_argument('--exclude-keywords', nargs='+', help='URLマッチングから除外するキーワード')
parser.add_argument('--output-extension', default='.md', help='出力ファイルの拡張子 (デフォルト: .md)')
parser.add_argument('--dust-size', type=int, default=1000, help='ダストフォルダに移動するファイルサイズのしきい値 (デフォルト: 1000バイト)')
parser.add_argument('--max-depth', type=int, default=None, help='再帰処理の最大深度 (デフォルト: 制限なし)')
parser.add_argument('--system-message', default=None, help='LiteLLMのシステムメッセージ(サイトの分類に使用)')
parser.add_argument('--classification-prompt', default=None, help='LiteLLMのサイト分類プロンプト(TrueまたはFalseを返すようにしてください)')
parser.add_argument('--max-retries', type=int, default=3, help='フィルタリングのリトライ回数の上限(デフォルト:3)')

args = parser.parse_args()

pegasus = Pegasus(
base_url=args.base_url,
output_dir=args.output_dir,
exclude_selectors=args.exclude_selectors,
include_domain=args.include_domain,
exclude_keywords=args.exclude_keywords,
output_extension=args.output_extension,
dust_size=args.dust_size
dust_size=args.dust_size,
max_depth=args.max_depth,
system_message=args.system_message,
classification_prompt=args.classification_prompt,
max_retries=args.max_retries
)
pegasus.run()

if args.base_url:
pegasus.run(args.base_url)
elif args.url_file:
with open(args.url_file, 'r') as file:
urls = file.read().splitlines()
for url in urls:
pegasus.run(url)
else:
parser.error("--base-url または --url-file のいずれかを指定してください。")

if __name__ == '__main__':
main()
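
One practical note on the new LLM filtering path: `filter_site` calls litellm's `completion` with `gemini/gemini-1.5-pro-latest`, and `cli.py` only runs `load_dotenv()`, so an API key has to be present in the environment or a local `.env` file. The sketch below is a pre-flight check under the assumption (not shown in this diff) that litellm's Gemini provider reads its key from the `GEMINI_API_KEY` environment variable.

```python
# Sketch of a pre-flight check before using --system-message / --classification-prompt.
# Assumption: litellm's gemini/ provider reads GEMINI_API_KEY; confirm against
# the litellm documentation for the installed version.
import os
from dotenv import load_dotenv

load_dotenv()  # same call cli.py performs at import time
if not os.getenv("GEMINI_API_KEY"):
    raise SystemExit("GEMINI_API_KEY is not set; LLM classification requests would fail.")
```
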
5 changes: 4 additions & 1 deletion setup.py
@@ -6,7 +6,7 @@

setup(
name='pegasus-surf',
version='0.1.1',
version='0.2.0',
description='A package for scraping websites and converting them to Markdown',
author='Maki',
author_email='sunwood.ai.labs@gmail.com',
@@ -20,6 +20,9 @@
'beautifulsoup4',
'loguru',
'art',
'litellm',
'python-dotenv',
'google-generativeai',
],
entry_points={
'console_scripts': [
8 changes: 8 additions & 0 deletions urls.example.txt
@@ -0,0 +1,8 @@
https://iroboteducation.github.io/create3_docs/
https://raw.githubusercontent.com/koalazak/dorita980/master/README.md
https://raw.githubusercontent.com/AtsushiSakai/PyRoombaAdapter/master/README.md
https://qiita.com/Yurix/items/234f7775a1f9d3ad43af
https://raw.githubusercontent.com/docofab/RoombaControlls/main/README.md
https://blog.sikmi.com/slack-bot-roomba
https://qiita.com/oystaar/items/d0a013facd02e8d81479
https://kakaku.com/kaden/vacuum-cleaner/itemlist.aspx?pdf_se=10
