Merge feature/pegasus-scraping-enhancements
Showing 7 changed files with 210 additions and 91 deletions.
@@ -167,4 +167,5 @@ tmp2.md
.SourceSageAssets
.aira/aira.Gaiah.md
.harmon_ai/README_template.md
output
output
urls.txt

@@ -1,87 +1,161 @@
# pegasus/pegasus.py
import requests
import markdownify
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import re
import loguru
import time
from art import *
from litellm import completion
from tqdm import tqdm
import litellm
# litellm.set_verbose=True

logger = loguru.logger

class Pegasus:
    def __init__(self, base_url, output_dir, exclude_selectors=None, include_domain=None, exclude_keywords=None, output_extension=".md", dust_size=1000):
        self.base_url = base_url
        self.output_dir = output_dir
        self.exclude_selectors = exclude_selectors
        self.include_domain = include_domain
        self.exclude_keywords = exclude_keywords
        self.visited_urls = set()
        self.output_extension = output_extension
        self.dust_size = dust_size
        tprint(" Pegasus ", font="rnd-xlarge")
        logger.info("Initialization parameters:")
        logger.info(f" base_url: {base_url}")
        logger.info(f" output_dir: {output_dir}")
        logger.info(f" exclude_selectors: {exclude_selectors}")
        logger.info(f" include_domain: {include_domain}")
        logger.info(f" exclude_keywords: {exclude_keywords}")
        logger.info(f" output_extension: {output_extension}")
        logger.info(f" dust_size: {dust_size}")

    def download_and_convert(self, url):
        os.makedirs(self.output_dir, exist_ok=True)
        if url in self.visited_urls:
            return
        self.visited_urls.add(url)

        try:
            response = requests.get(url)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            if self.exclude_selectors:
                for selector in self.exclude_selectors:
                    for element in soup.select(selector):
                        element.decompose()

            markdown_content = markdownify.markdownify(str(soup))
            markdown_content = re.sub(r'\n{5,}', '\n\n\n\n', markdown_content)

            parsed_url = urlparse(url)
            output_file = f"{self.output_dir}/{parsed_url.path.replace('/', '_')}{self.output_extension}"

            if len(markdown_content) < self.dust_size:
                dust_dir = os.path.join(self.output_dir, "dust")
                os.makedirs(dust_dir, exist_ok=True)
                output_file = f"{dust_dir}/{parsed_url.path.replace('/', '_')}{self.output_extension}"

            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(markdown_content)

            logger.info(f"Conversion succeeded: {url} ---> {output_file} [{len(markdown_content)/1000}kb]")

            soup_url = BeautifulSoup(response.text, 'html.parser')

            for link in soup_url.find_all('a'):
                href = link.get('href')
                if href:
                    absolute_url = urljoin(url, href)
                    if self.include_domain and self.include_domain in absolute_url:
                        if self.exclude_keywords:
                            if any(keyword in absolute_url for keyword in self.exclude_keywords):
                                continue
                        absolute_url = absolute_url.split('#')[0]
                        self.download_and_convert(absolute_url)

        except requests.exceptions.RequestException as e:
            logger.error(f"Download error: {url}: {e}")
        except IOError as e:
            logger.error(f"Write error: {output_file}: {e}")

    def run(self):
        logger.info(f"Scraping started: base_url={self.base_url}")
        self.download_and_convert(self.base_url)
        logger.info("Scraping finished")

    def __init__(self, output_dir, exclude_selectors=None, include_domain=None, exclude_keywords=None, output_extension=".md", dust_size=1000, max_depth=None, system_message=None, classification_prompt=None, max_retries=3):
        self.output_dir = output_dir
        self.exclude_selectors = exclude_selectors
        self.include_domain = include_domain
        self.exclude_keywords = exclude_keywords
        self.visited_urls = set()
        self.output_extension = output_extension
        self.dust_size = dust_size
        self.max_depth = max_depth
        self.domain_summaries = {}
        self.system_message = system_message
        self.classification_prompt = classification_prompt
        self.max_retries = max_retries
        tprint(" Pegasus ", font="rnd-xlarge")
        logger.info("Initialization parameters:")
        logger.info(f" output_dir: {output_dir}")
        logger.info(f" exclude_selectors: {exclude_selectors}")
        logger.info(f" include_domain: {include_domain}")
        logger.info(f" exclude_keywords: {exclude_keywords}")
        logger.info(f" output_extension: {output_extension}")
        logger.info(f" dust_size: {dust_size}")
        logger.info(f" max_depth: {max_depth}")
        logger.info(f" system_message: {system_message}")
        logger.info(f" classification_prompt: {classification_prompt}")
        logger.info(f" max_retries: {max_retries}")

    def filter_site(self, markdown_content):
        if(self.classification_prompt is None):
            return True

        retry_count = 0
        while retry_count < self.max_retries:
            try:
                messages = [
                    {"role": "system", "content": self.system_message},
                    {"role": "user", "content": f"{self.classification_prompt}\n\n{markdown_content}"}
                ]
                response = completion(
                    model="gemini/gemini-1.5-pro-latest",
                    messages=messages
                )
                content = response.get('choices', [{}])[0].get('message', {}).get('content')
                logger.debug(f"content : {content}")
                if "true" in content.lower():
                    return True
                elif "false" in content.lower():
                    return False
                else:
                    raise ValueError("The classification result is ambiguous.")
            except Exception as e:
                retry_count += 1
                logger.warning(f"An error occurred during filtering; retrying. ({retry_count}/{self.max_retries})\nError: {e}")

                if "429" in str(e):
                    sleep_time = 60  # sleep for 60 seconds
                else:
                    sleep_time = 10  # sleep for 10 seconds for other errors

                for _ in tqdm(range(sleep_time), desc="Sleeping", unit="s"):
                    time.sleep(1)

        logger.error(f"Filtering failed: the retry limit was reached. ({self.max_retries} attempts)")
        return True

    def download_and_convert(self, url, depth=0):
        if url in self.visited_urls:
            return
        self.visited_urls.add(url)

        try:
            response = requests.get(url)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            if self.exclude_selectors:
                for selector in self.exclude_selectors:
                    for element in soup.select(selector):
                        element.decompose()

            markdown_content = markdownify.markdownify(str(soup))
            markdown_content = re.sub(r'\n{5,}', '\n\n\n\n', markdown_content)

            if not self.filter_site(markdown_content):
                parsed_url = urlparse(url)
                domain = parsed_url.netloc
                domain_dir = os.path.join(self.output_dir, domain)
                os.makedirs(domain_dir, exist_ok=True)
                excluded_dir = os.path.join(domain_dir, "excluded")
                os.makedirs(excluded_dir, exist_ok=True)
                output_file = f"{excluded_dir}/{parsed_url.path.replace('/', '_')}{self.output_extension}"
            else:
                parsed_url = urlparse(url)
                domain = parsed_url.netloc
                domain_dir = os.path.join(self.output_dir, domain)
                os.makedirs(domain_dir, exist_ok=True)

                output_file = f"{domain_dir}/{parsed_url.path.replace('/', '_')}{self.output_extension}"

                if len(markdown_content) < self.dust_size:
                    dust_dir = os.path.join(domain_dir, "dust")
                    os.makedirs(dust_dir, exist_ok=True)
                    output_file = f"{dust_dir}/{parsed_url.path.replace('/', '_')}{self.output_extension}"

            with open(output_file, 'w', encoding='utf-8') as file:
                file.write(markdown_content)

            logger.info(f"[{depth}] Conversion succeeded: {url} ---> {output_file} [{len(markdown_content)/1000}kb]")

            if domain not in self.domain_summaries:
                self.domain_summaries[domain] = []
            self.domain_summaries[domain].append(f"# {os.path.basename(output_file)}\n\n---\n\n{markdown_content}")

            if self.max_depth is None or depth < self.max_depth:
                soup_url = BeautifulSoup(response.text, 'html.parser')

                for link in soup_url.find_all('a'):
                    href = link.get('href')
                    if href:
                        absolute_url = urljoin(url, href)
                        if (self.include_domain and self.include_domain in absolute_url) or (self.include_domain == ""):
                            if self.exclude_keywords:
                                if any(keyword in absolute_url for keyword in self.exclude_keywords):
                                    continue
                            absolute_url = absolute_url.split('#')[0]
                            self.download_and_convert(absolute_url, depth + 1)

        except requests.exceptions.RequestException as e:
            logger.error(f"Download error: {url}: {e}")
        except IOError as e:
            logger.error(f"Write error: {output_file}: {e}")

    def create_domain_summaries(self):
        for domain, summaries in self.domain_summaries.items():
            summary_file = os.path.join(self.output_dir, f"{domain}_summary{self.output_extension}")
            with open(summary_file, 'w', encoding='utf-8') as file:
                file.write('\n\n'.join(summaries))
            logger.info(f"Summary file created: {summary_file}")

    def run(self, base_url):
        logger.info(f"Scraping started: base_url={base_url}")
        self.download_and_convert(base_url)
        self.create_domain_summaries()
        logger.info("Scraping finished")
@@ -1,29 +1,47 @@
# pegasus/cli.py
import argparse
from .Pegasus import Pegasus
from dotenv import load_dotenv
load_dotenv(verbose=True)

def main():
    parser = argparse.ArgumentParser(description='Pegasus')
    parser.add_argument('base_url', help='Base URL to start scraping')
    parser.add_argument('output_dir', help='Output directory for markdown files')
    parser.add_argument('--exclude-selectors', nargs='+', help='CSS selectors to exclude')
    parser.add_argument('--include-domain', help='Domain to include in URL matching')
    parser.add_argument('--exclude-keywords', nargs='+', help='Keywords to exclude in URL matching')
    parser.add_argument('--output-extension', default='.md', help='Output file extension (default: .md)')
    parser.add_argument('--dust-size', type=int, default=1000, help='File size threshold for moving to dust folder (default: 1000 bytes)')
    parser.add_argument('--base-url', help='Base URL to start scraping from')
    parser.add_argument('--url-file', help='Text file containing the URLs to scrape')
    parser.add_argument('output_dir', help='Output directory for Markdown files')
    parser.add_argument('--exclude-selectors', nargs='+', help='CSS selectors to exclude')
    parser.add_argument('--include-domain', default='', help='Domain to include in URL matching')
    parser.add_argument('--exclude-keywords', nargs='+', help='Keywords to exclude from URL matching')
    parser.add_argument('--output-extension', default='.md', help='Output file extension (default: .md)')
    parser.add_argument('--dust-size', type=int, default=1000, help='File size threshold below which files are moved to the dust folder (default: 1000 bytes)')
    parser.add_argument('--max-depth', type=int, default=None, help='Maximum recursion depth (default: unlimited)')
    parser.add_argument('--system-message', default=None, help='System message for LiteLLM (used for site classification)')
    parser.add_argument('--classification-prompt', default=None, help='Site classification prompt for LiteLLM (have it return True or False)')
    parser.add_argument('--max-retries', type=int, default=3, help='Maximum number of retries for filtering (default: 3)')

    args = parser.parse_args()

    pegasus = Pegasus(
        base_url=args.base_url,
        output_dir=args.output_dir,
        exclude_selectors=args.exclude_selectors,
        include_domain=args.include_domain,
        exclude_keywords=args.exclude_keywords,
        output_extension=args.output_extension,
        dust_size=args.dust_size
        dust_size=args.dust_size,
        max_depth=args.max_depth,
        system_message=args.system_message,
        classification_prompt=args.classification_prompt,
        max_retries=args.max_retries
    )
    pegasus.run()

    if args.base_url:
        pegasus.run(args.base_url)
    elif args.url_file:
        with open(args.url_file, 'r') as file:
            urls = file.read().splitlines()
        for url in urls:
            pegasus.run(url)
    else:
        parser.error("Specify either --base-url or --url-file.")

if __name__ == '__main__':
    main()
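
One way to exercise the new --url-file mode without a console entry point is to call main() with a patched sys.argv. This snippet is illustrative only: the module path is assumed from the "# pegasus/cli.py" header, and urls.txt refers to the sample file added later in this commit.

# Hypothetical driver for the new CLI options (not part of the commit).
import sys
from pegasus.cli import main  # module path assumed from the "# pegasus/cli.py" header

sys.argv = [
    "pegasus",
    "output",                  # output_dir (positional argument)
    "--url-file", "urls.txt",  # scrape every URL listed in the file
    "--max-depth", "1",        # follow links at most one level deep per URL
]
main()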
@@ -0,0 +1,8 @@
https://iroboteducation.github.io/create3_docs/
https://raw.githubusercontent.com/koalazak/dorita980/master/README.md
https://raw.githubusercontent.com/AtsushiSakai/PyRoombaAdapter/master/README.md
https://qiita.com/Yurix/items/234f7775a1f9d3ad43af
https://raw.githubusercontent.com/docofab/RoombaControlls/main/README.md
https://blog.sikmi.com/slack-bot-roomba
https://qiita.com/oystaar/items/d0a013facd02e8d81479
https://kakaku.com/kaden/vacuum-cleaner/itemlist.aspx?pdf_se=10
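
These URLs are the sample inputs for --url-file mode. As an illustration (not part of the commit), the path logic in download_and_convert maps the first entry to an output file as follows:

# Sketch of the output path produced for the first URL in urls.txt, following
# download_and_convert: output_dir/<netloc>/<path with '/' replaced by '_'><extension>
from urllib.parse import urlparse

url = "https://iroboteducation.github.io/create3_docs/"
parsed = urlparse(url)
print(f"output/{parsed.netloc}/{parsed.path.replace('/', '_')}.md")
# -> output/iroboteducation.github.io/_create3_docs_.md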