diff --git a/README.md b/README.md index 3f3c67f79..ff35d36cc 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ The agent can produce detailed, factual and unbiased research reports, with cust ## Why GPT Researcher? + - To form objective conclusions for manual research tasks can take time, sometimes weeks to find the right resources and information. - Current LLMs are trained on past and outdated information, with heavy risks of hallucinations, making them almost irrelevant for research tasks. - Solutions that enable web search (such as ChatGPT + Web Plugin), only consider limited resources and content that in some cases result in superficial conclusions or biased answers. diff --git a/actions/web_search.py b/actions/web_search.py deleted file mode 100644 index fd0daaf32..000000000 --- a/actions/web_search.py +++ /dev/null @@ -1,124 +0,0 @@ -from __future__ import annotations -import os -import json -import requests -from duckduckgo_search import DDGS -from tavily import TavilyClient -from langchain.utilities import SearxSearchWrapper -from config import Config - -CFG = Config() - - -def web_search(query: str, num_results: int = 4) -> str: - """Useful for general internet search queries.""" - print("Searching with query {0}...".format(query)) - search_results = [] - search_response = [] - if not query: - return json.dumps(search_results) - - if CFG.search_api == "tavily": - results = tavily_web_search(query, num_results) - # Normalizing results to match the format of the other search APIs - search_response = [{"href": obj["url"], "body": obj["content"]} for obj in results] - - elif CFG.search_api == "googleSerp": - return serp_web_search(os.environ["SERP_API_KEY"], query, num_results) - - elif CFG.search_api == "googleAPI": - return google_web_search(os.environ["GOOGLE_API_KEY"], os.environ["GOOGLE_CX"], query, num_results) - - elif CFG.search_api == "searx": - searx = SearxSearchWrapper(searx_host=os.environ["SEARX_URL"]) - results = searx.results(query, num_results) - # Normalizing results to match the format of the other search APIs - search_response = [{"href": obj["link"], "body": obj["snippet"]} for obj in results] - - elif CFG.search_api == "duckduckgo": - ddgs = DDGS() - search_response = ddgs.text(query) - - total_added = 0 - for j in search_response: - search_results.append(j) - total_added += 1 - if total_added >= num_results: - break - - return json.dumps(search_results, ensure_ascii=False, indent=4) - -def tavily_web_search(query: str, num_results: int = 10): - tavily_search = TavilyClient(os.environ["TAVILY_API_KEY"]) - try: - results = tavily_search.search(query, search_depth="advanced", max_results=num_results).get("results", []) - except Exception as e: - print(e) - results = [] - return results - -def serp_web_search(serp_api_key:str, query: str, num_results: int = 10) -> str: - """Useful for general internet search queries using the Serp API.""" - url = "https://serpapi.com/search.json?engine=google&q=" + query + "&api_key=" + serp_api_key - resp = requests.request("GET", url) - - if resp is None: - return - try: - search_results = json.loads(resp.text) - except Exception: - return - if search_results is None: - return - - results = search_results["organic_results"] - search_results = [] - - # Normalizing results to match the format of the other search APIs - for result in results: - # skip youtube results - if "youtube.com" in result["link"]: - continue - search_result = { - "title": result["title"], - "href": result["link"], - "body": result["snippet"], - } - search_results.append(search_result) - print("Searching with query {0}...".format(query)) - return json.dumps(search_results, ensure_ascii=False, indent=4) - - -def google_web_search(google_api_key:str, google_cx:str, query: str, num_result: int = 10) -> str: - """Useful for general internet search queries using the Google API.""" - - url = f"https://www.googleapis.com/customsearch/v1?key={google_api_key}&cx={google_cx}&q={query}&start=1" - resp = requests.get(url) - - if resp is None: - return - try: - search_results = json.loads(resp.text) - except Exception: - return - if search_results is None: - return - - results = search_results.get("items", []) - search_results = [] - - # Normalizing results to match the format of the other search APIs - for result in results: - # skip youtube results - if "youtube.com" in result["link"]: - continue - search_result = { - "title": result["title"], - "href": result["link"], - "body": result["snippet"], - } - search_results.append(search_result) - - print("Searching with query {0}...".format(query)) - - return json.dumps(search_results[:num_result], ensure_ascii=False, indent=4) \ No newline at end of file diff --git a/agent/prompts.py b/agent/prompts.py deleted file mode 100644 index a9a86cb8d..000000000 --- a/agent/prompts.py +++ /dev/null @@ -1,142 +0,0 @@ -from datetime import datetime -def generate_agent_role_prompt(agent): - """ Generates the agent role prompt. - Args: agent (str): The type of the agent. - Returns: str: The agent role prompt. - """ - prompts = { - "Finance Agent": "You are a seasoned finance analyst AI assistant. Your primary goal is to compose comprehensive, astute, impartial, and methodically arranged financial reports based on provided data and trends.", - "Travel Agent": "You are a world-travelled AI tour guide assistant. Your main purpose is to draft engaging, insightful, unbiased, and well-structured travel reports on given locations, including history, attractions, and cultural insights.", - "Academic Research Agent": "You are an AI academic research assistant. Your primary responsibility is to create thorough, academically rigorous, unbiased, and systematically organized reports on a given research topic, following the standards of scholarly work.", - "Business Analyst": "You are an experienced AI business analyst assistant. Your main objective is to produce comprehensive, insightful, impartial, and systematically structured business reports based on provided business data, market trends, and strategic analysis.", - "Computer Security Analyst Agent": "You are an AI specializing in computer security analysis. Your principal duty is to generate comprehensive, meticulously detailed, impartial, and systematically structured reports on computer security topics. This includes Exploits, Techniques, Threat Actors, and Advanced Persistent Threat (APT) Groups. All produced reports should adhere to the highest standards of scholarly work and provide in-depth insights into the complexities of computer security.", - "Default Agent": "You are an AI critical thinker research assistant. Your sole purpose is to write well written, critically acclaimed, objective and structured reports on given text." - } - - return prompts.get(agent, "No such agent") - - -def generate_report_prompt(question, research_summary, report_format="apa", report_length=1500): - """ Generates the report prompt for the given question and research summary. - Args: question (str): The question to generate the report prompt for - research_summary (str): The research summary to generate the report prompt for - Returns: str: The report prompt for the given question and research summary - """ - - return f'Information: """{research_summary}"""\n\n' \ - f'Using the above information, answer the following'\ - f' question or topic: "{question}" in a detailed report --'\ - " The report should focus on the answer to the question, should be well structured, informative," \ - f" in depth, with facts and numbers if available and a minimum of {report_length} words.\n" \ - "You should strive to write the report as long as you can using all relevant and necessary information provided.\n" \ - "You must write the report with markdown syntax.\n "\ - "You MUST determine your own concrete and valid opinion based on the given information. Do NOT deter to general and meaningless conclusions.\n" \ - f"Write all used source urls at the end of the report, and make sure to not add duplicated sources, but only one reference for each.\n" \ - f"You must write the report in {report_format} format.\n " \ - f"Please do your best, this is very important to my career. "\ - f"Assume that the current date is {datetime.now().strftime('%B %d, %Y')}" - -def generate_search_queries_prompt(question): - """ Generates the search queries prompt for the given question. - Args: question (str): The question to generate the search queries prompt for - Returns: str: The search queries prompt for the given question - """ - - return f'Write 3 google search queries to search online that form an objective opinion from the following: "{question}"'\ - f'Use the current date if needed: {datetime.now().strftime("%B %d, %Y")}.\n' \ - f'You must respond with a list of strings in the following format: ["query 1", "query 2", "query 3"].' - - -def generate_resource_report_prompt(question, research_summary): - """Generates the resource report prompt for the given question and research summary. - - Args: - question (str): The question to generate the resource report prompt for. - research_summary (str): The research summary to generate the resource report prompt for. - - Returns: - str: The resource report prompt for the given question and research summary. - """ - return f'"""{research_summary}""" Based on the above information, generate a bibliography recommendation report for the following' \ - f' question or topic: "{question}". The report should provide a detailed analysis of each recommended resource,' \ - ' explaining how each source can contribute to finding answers to the research question.' \ - ' Focus on the relevance, reliability, and significance of each source.' \ - ' Ensure that the report is well-structured, informative, in-depth, and follows Markdown syntax.' \ - ' Include relevant facts, figures, and numbers whenever available.' \ - ' The report should have a minimum length of 1,200 words.' - - -def generate_outline_report_prompt(question, research_summary): - """ Generates the outline report prompt for the given question and research summary. - Args: question (str): The question to generate the outline report prompt for - research_summary (str): The research summary to generate the outline report prompt for - Returns: str: The outline report prompt for the given question and research summary - """ - - return f'"""{research_summary}""" Using the above information, generate an outline for a research report in Markdown syntax'\ - f' for the following question or topic: "{question}". The outline should provide a well-structured framework'\ - ' for the research report, including the main sections, subsections, and key points to be covered.' \ - ' The research report should be detailed, informative, in-depth, and a minimum of 1,200 words.' \ - ' Use appropriate Markdown syntax to format the outline and ensure readability.' - -def generate_concepts_prompt(question, research_summary): - """ Generates the concepts prompt for the given question. - Args: question (str): The question to generate the concepts prompt for - research_summary (str): The research summary to generate the concepts prompt for - Returns: str: The concepts prompt for the given question - """ - - return f'"""{research_summary}""" Using the above information, generate a list of 5 main concepts to learn for a research report'\ - f' on the following question or topic: "{question}". The outline should provide a well-structured framework'\ - 'You must respond with a list of strings in the following format: ["concepts 1", "concepts 2", "concepts 3", "concepts 4, concepts 5"]' - - -def generate_lesson_prompt(concept): - """ - Generates the lesson prompt for the given question. - Args: - concept (str): The concept to generate the lesson prompt for. - Returns: - str: The lesson prompt for the given concept. - """ - - prompt = f'generate a comprehensive lesson about {concept} in Markdown syntax. This should include the definition'\ - f'of {concept}, its historical background and development, its applications or uses in different'\ - f'fields, and notable events or facts related to {concept}.' - - return prompt - -def get_report_by_type(report_type): - report_type_mapping = { - 'research_report': generate_report_prompt, - 'resource_report': generate_resource_report_prompt, - 'outline_report': generate_outline_report_prompt - } - return report_type_mapping[report_type] - -def auto_agent_instructions(): - return """ - This task involves researching a given topic, regardless of its complexity or the availability of a definitive answer. The research is conducted by a specific agent, defined by its type and role, with each agent requiring distinct instructions. - Agent - The agent is determined by the field of the topic and the specific name of the agent that could be utilized to research the topic provided. Agents are categorized by their area of expertise, and each agent type is associated with a corresponding emoji. - - examples: - task: "should I invest in apple stocks?" - response: - { - "agent": "π° Finance Agent", - "agent_role_prompt: "You are a seasoned finance analyst AI assistant. Your primary goal is to compose comprehensive, astute, impartial, and methodically arranged financial reports based on provided data and trends." - } - task: "could reselling sneakers become profitable?" - response: - { - "agent": "π Business Analyst Agent", - "agent_role_prompt": "You are an experienced AI business analyst assistant. Your main objective is to produce comprehensive, insightful, impartial, and systematically structured business reports based on provided business data, market trends, and strategic analysis." - } - task: "what are the most interesting sites in Tel Aviv?" - response: - { - "agent: "π Travel Agent", - "agent_role_prompt": "You are a world-travelled AI tour guide assistant. Your main purpose is to draft engaging, insightful, unbiased, and well-structured travel reports on given locations, including history, attractions, and cultural insights." - } - """ diff --git a/agent/research_agent.py b/agent/research_agent.py deleted file mode 100644 index 17de24d0b..000000000 --- a/agent/research_agent.py +++ /dev/null @@ -1,188 +0,0 @@ -# Description: Research assistant class that handles the research process for a given question. - -# libraries -import asyncio -import json -import hashlib - -from actions.web_search import web_search -from actions.web_scrape import async_browse -from processing.text import \ - write_to_file, \ - create_message, \ - create_chat_completion, \ - read_txt_files, \ - write_md_to_pdf -from config import Config -from agent import prompts -import os -import string - - -CFG = Config() - - -class ResearchAgent: - def __init__(self, question, agent, agent_role_prompt, websocket=None): - """ Initializes the research assistant with the given question. - Args: question (str): The question to research - Returns: None - """ - - self.question = question - self.agent = agent - self.agent_role_prompt = agent_role_prompt if agent_role_prompt else prompts.generate_agent_role_prompt(agent) - self.visited_urls = set() - self.research_summary = "" - self.dir_path = f"./outputs/{hashlib.sha1(question.encode()).hexdigest()}" - self.websocket = websocket - - async def stream_output(self, output): - if not self.websocket: - return print(output) - await self.websocket.send_json({"type": "logs", "output": output}) - - - async def summarize(self, text, topic): - """ Summarizes the given text for the given topic. - Args: text (str): The text to summarize - topic (str): The topic to summarize the text for - Returns: str: The summarized text - """ - - messages = [create_message(text, topic)] - await self.stream_output(f"π Summarizing text for query: {text}") - - return create_chat_completion( - model=CFG.fast_llm_model, - messages=messages, - ) - - async def get_new_urls(self, url_set_input): - """ Gets the new urls from the given url set. - Args: url_set_input (set[str]): The url set to get the new urls from - Returns: list[str]: The new urls from the given url set - """ - - new_urls = [] - for url in url_set_input: - if url not in self.visited_urls: - await self.stream_output(f"β Adding source url to research: {url}\n") - - self.visited_urls.add(url) - new_urls.append(url) - - return new_urls - - async def call_agent(self, action, stream=False, websocket=None): - messages = [{ - "role": "system", - "content": self.agent_role_prompt - }, { - "role": "user", - "content": action, - }] - answer = create_chat_completion( - model=CFG.smart_llm_model, - messages=messages, - stream=stream, - websocket=websocket, - ) - return answer - - async def create_search_queries(self): - """ Creates the search queries for the given question. - Args: None - Returns: list[str]: The search queries for the given question - """ - result = await self.call_agent(prompts.generate_search_queries_prompt(self.question)) - await self.stream_output(f"π§ I will conduct my research based on the following queries: {result}...") - return json.loads(result) - - async def async_search(self, query): - """ Runs the async search for the given query. - Args: query (str): The query to run the async search for - Returns: list[str]: The async search for the given query - """ - search_results = json.loads(web_search(query)) - new_search_urls = self.get_new_urls([url.get("href") for url in search_results]) - - await self.stream_output(f"π Browsing the following sites for relevant information: {new_search_urls}...") - - # Create a list to hold the coroutine objects - tasks = [async_browse(url, query, self.websocket) for url in await new_search_urls] - - # Gather the results as they become available - responses = await asyncio.gather(*tasks, return_exceptions=True) - - return responses - - async def run_search_summary(self, query): - """ Runs the search summary for the given query. - Args: query (str): The query to run the search summary for - Returns: str: The search summary for the given query - """ - - await self.stream_output(f"π Running research for '{query}'...") - - responses = await self.async_search(query) - - result = "\n".join(responses) - os.makedirs(os.path.dirname(f"{self.dir_path}/research-{query}.txt"), exist_ok=True) - write_to_file(f"{self.dir_path}/research-{query}.txt", result) - return result - - async def conduct_research(self): - """ Conducts the research for the given question. - Args: None - Returns: str: The research for the given question - """ - self.research_summary = read_txt_files(self.dir_path) if os.path.isdir(self.dir_path) else "" - - if not self.research_summary: - search_queries = await self.create_search_queries() - for query in search_queries: - research_result = await self.run_search_summary(query) - self.research_summary += f"{research_result}\n\n" - - await self.stream_output(f"Total research words: {len(self.research_summary.split(' '))}") - - return self.research_summary - - - async def create_concepts(self): - """ Creates the concepts for the given question. - Args: None - Returns: list[str]: The concepts for the given question - """ - result = self.call_agent(prompts.generate_concepts_prompt(self.question, self.research_summary)) - - await self.stream_output(f"I will research based on the following concepts: {result}\n") - return json.loads(result) - - async def write_report(self, report_type, websocket=None): - """ Writes the report for the given question. - Args: None - Returns: str: The report for the given question - """ - report_type_func = prompts.get_report_by_type(report_type) - await self.stream_output(f"βοΈ Writing {report_type} for research task: {self.question}...") - - answer = await self.call_agent(report_type_func(self.question, self.research_summary), - stream=websocket is not None, websocket=websocket) - # if websocket is True than we are streaming gpt response, so we need to wait for the final response - final_report = await answer if websocket else answer - - path = await write_md_to_pdf(report_type, self.dir_path, final_report) - - return answer, path - - async def write_lessons(self): - """ Writes lessons on essential concepts of the research. - Args: None - Returns: None - """ - concepts = await self.create_concepts() - for concept in concepts: - answer = await self.call_agent(prompts.generate_lesson_prompt(concept), stream=True) - await write_md_to_pdf("Lesson", self.dir_path, answer) diff --git a/agent/run.py b/agent/run.py deleted file mode 100644 index 1fdef384c..000000000 --- a/agent/run.py +++ /dev/null @@ -1,60 +0,0 @@ -import asyncio -import datetime - -from typing import List, Dict -from fastapi import WebSocket -from config import check_config_setup -from agent.research_agent import ResearchAgent - - -class WebSocketManager: - def __init__(self): - self.active_connections: List[WebSocket] = [] - self.sender_tasks: Dict[WebSocket, asyncio.Task] = {} - self.message_queues: Dict[WebSocket, asyncio.Queue] = {} - - async def start_sender(self, websocket: WebSocket): - queue = self.message_queues[websocket] - while True: - message = await queue.get() - if websocket in self.active_connections: - await websocket.send_text(message) - else: - break - - async def connect(self, websocket: WebSocket): - await websocket.accept() - self.active_connections.append(websocket) - self.message_queues[websocket] = asyncio.Queue() - self.sender_tasks[websocket] = asyncio.create_task(self.start_sender(websocket)) - - async def disconnect(self, websocket: WebSocket): - self.active_connections.remove(websocket) - self.sender_tasks[websocket].cancel() - del self.sender_tasks[websocket] - del self.message_queues[websocket] - - async def start_streaming(self, task, report_type, agent, agent_role_prompt, websocket): - report, path = await run_agent(task, report_type, agent, agent_role_prompt, websocket) - return report, path - - -async def run_agent(task, report_type, agent, agent_role_prompt, websocket): - check_config_setup() - - start_time = datetime.datetime.now() - - # await websocket.send_json({"type": "logs", "output": f"Start time: {str(start_time)}\n\n"}) - - assistant = ResearchAgent(task, agent, agent_role_prompt, websocket) - await assistant.conduct_research() - - report, path = await assistant.write_report(report_type, websocket) - - await websocket.send_json({"type": "path", "output": path}) - - end_time = datetime.datetime.now() - await websocket.send_json({"type": "logs", "output": f"\nEnd time: {end_time}\n"}) - await websocket.send_json({"type": "logs", "output": f"\nTotal run time: {end_time - start_time}\n"}) - - return report, path diff --git a/backend/server.py b/backend/server.py new file mode 100644 index 000000000..5b49bbd57 --- /dev/null +++ b/backend/server.py @@ -0,0 +1,58 @@ +from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect +from fastapi.staticfiles import StaticFiles +from fastapi.templating import Jinja2Templates +from pydantic import BaseModel +import json +import os +from gpt_researcher.utils.websocket_manager import WebSocketManager +from .utils import write_md_to_pdf + + +class ResearchRequest(BaseModel): + task: str + report_type: str + agent: str + + +app = FastAPI() + +app.mount("/site", StaticFiles(directory="./frontend"), name="site") +app.mount("/static", StaticFiles(directory="./frontend/static"), name="static") + +templates = Jinja2Templates(directory="./frontend") + +manager = WebSocketManager() + + +# Dynamic directory for outputs once first research is run +@app.on_event("startup") +def startup_event(): + if not os.path.isdir("outputs"): + os.makedirs("outputs") + app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs") + +@app.get("/") +async def read_root(request: Request): + return templates.TemplateResponse('index.html', {"request": request, "report": None}) + + +@app.websocket("/ws") +async def websocket_endpoint(websocket: WebSocket): + await manager.connect(websocket) + try: + while True: + data = await websocket.receive_text() + if data.startswith("start"): + json_data = json.loads(data[6:]) + task = json_data.get("task") + report_type = json_data.get("report_type") + if task and report_type: + report = await manager.start_streaming(task, report_type, websocket) + path = await write_md_to_pdf(report) + await websocket.send_json({"type": "path", "output": path}) + else: + print("Error: not enough parameters provided.") + + except WebSocketDisconnect: + await manager.disconnect(websocket) + diff --git a/backend/utils.py b/backend/utils.py new file mode 100644 index 000000000..d60adb757 --- /dev/null +++ b/backend/utils.py @@ -0,0 +1,44 @@ +import aiofiles +import urllib +import uuid +from md2pdf.core import md2pdf + +async def write_to_file(filename: str, text: str) -> None: + """Asynchronously write text to a file in UTF-8 encoding. + + Args: + filename (str): The filename to write to. + text (str): The text to write. + """ + # Convert text to UTF-8, replacing any problematic characters + text_utf8 = text.encode('utf-8', errors='replace').decode('utf-8') + + async with aiofiles.open(filename, "w", encoding='utf-8') as file: + await file.write(text_utf8) + +async def write_md_to_pdf(text: str) -> str: + """Converts Markdown text to a PDF file and returns the file path. + + Args: + text (str): Markdown text to convert. + + Returns: + str: The encoded file path of the generated PDF. + """ + task = uuid.uuid4().hex + file_path = f"outputs/{task}" + await write_to_file(f"{file_path}.md", text) + + try: + md2pdf(f"{file_path}.pdf", + md_content=None, + md_file_path=f"{file_path}.md", + css_file_path=None, + base_url=None) + print(f"Report written to {file_path}.pdf") + except Exception as e: + print(f"Error in converting Markdown to PDF: {e}") + return "" + + encoded_file_path = urllib.parse.quote(f"{file_path}.pdf") + return encoded_file_path diff --git a/config.json b/config.json new file mode 100644 index 000000000..630e4d9d8 --- /dev/null +++ b/config.json @@ -0,0 +1,17 @@ +{ + "openai_api_key": null, + "debug_mode": false, + "allow_downloads": false, + "selenium_web_browser": "chrome", + "search_api": "tavily", + "llm_provider": "ChatOpenAI", + "fast_llm_model": "gpt-3.5-turbo-16k", + "smart_llm_model": "gpt-4", + "fast_token_limit": 2000, + "smart_token_limit": 4000, + "browse_chunk_max_length": 8192, + "summary_token_limit": 700, + "temperature": 1.0, + "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36", + "memory_backend": "local" +} \ No newline at end of file diff --git a/config/__init__.py b/config/__init__.py deleted file mode 100644 index ec7b37209..000000000 --- a/config/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from config.config import Config, check_config_setup -from config.singleton import AbstractSingleton, Singleton - -__all__ = [ - "check_config_setup", - "AbstractSingleton", - "Config", - "Singleton", -] diff --git a/config/config.py b/config/config.py deleted file mode 100644 index 620d35140..000000000 --- a/config/config.py +++ /dev/null @@ -1,148 +0,0 @@ -"""Configuration class to store the state of bools for different scripts access.""" -import os -import sys -import openai -from colorama import Fore -from dotenv import load_dotenv - -from config.singleton import Singleton - -load_dotenv(verbose=True) - -class Config(metaclass=Singleton): - """ - Configuration class to store the state of bools for different scripts access. - """ - - def __init__(self) -> None: - """Initialize the Config class""" - self.debug_mode = False - self.allow_downloads = False - - self.selenium_web_browser = os.getenv("USE_WEB_BROWSER", "chrome") - self.search_api = os.getenv("SEARCH_API", "tavily") - self.llm_provider = os.getenv("LLM_PROVIDER", "ChatOpenAI") - self.fast_llm_model = os.getenv("FAST_LLM_MODEL", "gpt-3.5-turbo-16k") - self.smart_llm_model = os.getenv("SMART_LLM_MODEL", "gpt-4-1106-preview") - self.fast_token_limit = int(os.getenv("FAST_TOKEN_LIMIT", 2000)) - self.smart_token_limit = int(os.getenv("SMART_TOKEN_LIMIT", 4000)) - self.browse_chunk_max_length = int(os.getenv("BROWSE_CHUNK_MAX_LENGTH", 8192)) - self.summary_token_limit = int(os.getenv("SUMMARY_TOKEN_LIMIT", 1000)) - - self.openai_api_key = os.getenv("OPENAI_API_KEY") - self.temperature = float(os.getenv("TEMPERATURE", "0.4")) - - self.user_agent = os.getenv( - "USER_AGENT", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36" - " (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36", - ) - - self.memory_backend = os.getenv("MEMORY_BACKEND", "local") - # Initialize the OpenAI API client - openai.api_key = self.openai_api_key - - def set_fast_llm_model(self, value: str) -> None: - """Set the fast LLM model value.""" - self.fast_llm_model = value - - def set_smart_llm_model(self, value: str) -> None: - """Set the smart LLM model value.""" - self.smart_llm_model = value - - def set_fast_token_limit(self, value: int) -> None: - """Set the fast token limit value.""" - self.fast_token_limit = value - - def set_smart_token_limit(self, value: int) -> None: - """Set the smart token limit value.""" - self.smart_token_limit = value - - def set_browse_chunk_max_length(self, value: int) -> None: - """Set the browse_website command chunk max length value.""" - self.browse_chunk_max_length = value - - def set_openai_api_key(self, value: str) -> None: - """Set the OpenAI API key value.""" - self.openai_api_key = value - - def set_debug_mode(self, value: bool) -> None: - """Set the debug mode value.""" - self.debug_mode = value - -class APIKeyError(Exception): - """ - Exception raised when an API key is not set in config.py or as an environment variable. - """ - def __init__(self, service_name: str): - self.service_name = service_name - - def __str__(self): - if self.service_name == "Tavily": - service_env = "TAVILY_API_KEY" - link = "https://app.tavily.com" - elif self.service_name == "GoogleSerp": - service_env = "SERP_API_KEY" - link = "https://serper.dev/" - elif self.service_name == "Google": - service_env = "Google_API_KEY and GOOGLE_CX" - link = "https://developers.google.com/custom-search/v1/overview" - elif self.service_name == "Searx": - service_env = "SEARX_URL" - link = "https://searx.space/" - elif self.service_name == "OpenAI": - link = "https://platform.openai.com/account/api-keys" - return ( - Fore.RED - + "Please set your OpenAI API key in .env or as an environment variable.\n" - + "You can get your key from https://platform.openai.com/account/api-keys" - ) - - return ( - Fore.RED - + f"Please set your {self.service_name} API key in .env or as an environment variable '{service_env}'.\n" - + f"You can get your key from {link} \n" - + "Alternatively, you can change the 'search_api' value in config.py to 'duckduckgo'." - ) - -def check_config_setup() -> None: - cfg = Config() - check_openai_api_key(cfg) - if cfg.search_api == "tavily": - check_tavily_api_key(cfg) - elif cfg.search_api == "googleAPI": - check_google_api_key(cfg) - elif cfg.search_api == "googleSerp": - check_serp_api_key(cfg) - elif cfg.search_api == "searx": - check_searx_url(cfg) - -def check_openai_api_key(cfg) -> None: - """Check if the OpenAI API key is set in config.py or as an environment variable.""" - if not cfg.openai_api_key: - raise APIKeyError("OpenAI") - -def check_tavily_api_key(cfg) -> None: - """Check if the Tavily Search API key is set in config.py or as an environment variable.""" - tavily_api_key = os.getenv("TAVILY_API_KEY") - if not tavily_api_key and cfg.search_api == "tavily": - raise APIKeyError("Tavily") - -def check_google_api_key(cfg) -> None: - """Check if the Google API key is set in config.py or as an environment variable.""" - google_api_key = os.getenv("GOOGLE_API_KEY") - google_cx = os.getenv("GOOGLE_CX") - if not google_api_key and not google_cx and cfg.search_api == "googleAPI": - raise APIKeyError("Google") - -def check_serp_api_key(cfg) -> None: - """Check if the SERP API key is set in config.py or as an environment variable.""" - serp_api_key = os.getenv("SERP_API_KEY") - if not serp_api_key and cfg.search_api == "googleSerp": - raise APIKeyError("GoogleSerp") - -def check_searx_url(cfg) -> None: - """Check if the Searx URL is set in config.py or as an environment variable.""" - searx_url = os.getenv("SEARX_URL") - if not searx_url and cfg.search_api == "searx": - raise APIKeyError("Searx") \ No newline at end of file diff --git a/config/singleton.py b/config/singleton.py deleted file mode 100644 index 55b2aeea1..000000000 --- a/config/singleton.py +++ /dev/null @@ -1,24 +0,0 @@ -"""The singleton metaclass for ensuring only one instance of a class.""" -import abc - - -class Singleton(abc.ABCMeta, type): - """ - Singleton metaclass for ensuring only one instance of a class. - """ - - _instances = {} - - def __call__(cls, *args, **kwargs): - """Call method for the singleton metaclass.""" - if cls not in cls._instances: - cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) - return cls._instances[cls] - - -class AbstractSingleton(abc.ABC, metaclass=Singleton): - """ - Abstract singleton class for ensuring only one instance of a class. - """ - - pass diff --git a/permchain_example/README.md b/examples/permchain_agents/README.md similarity index 100% rename from permchain_example/README.md rename to examples/permchain_agents/README.md diff --git a/permchain_example/editor_actors/editor.py b/examples/permchain_agents/editor_actors/editor.py similarity index 100% rename from permchain_example/editor_actors/editor.py rename to examples/permchain_agents/editor_actors/editor.py diff --git a/permchain_example/research_team.py b/examples/permchain_agents/research_team.py similarity index 100% rename from permchain_example/research_team.py rename to examples/permchain_agents/research_team.py diff --git a/permchain_example/researcher.py b/examples/permchain_agents/researcher.py similarity index 100% rename from permchain_example/researcher.py rename to examples/permchain_agents/researcher.py diff --git a/permchain_example/reviser_actors/reviser.py b/examples/permchain_agents/reviser_actors/reviser.py similarity index 100% rename from permchain_example/reviser_actors/reviser.py rename to examples/permchain_agents/reviser_actors/reviser.py diff --git a/permchain_example/search_actors/gpt_researcher.py b/examples/permchain_agents/search_actors/gpt_researcher.py similarity index 85% rename from permchain_example/search_actors/gpt_researcher.py rename to examples/permchain_agents/search_actors/gpt_researcher.py index 899ec25fb..c118af30b 100644 --- a/permchain_example/search_actors/gpt_researcher.py +++ b/examples/permchain_agents/search_actors/gpt_researcher.py @@ -1,14 +1,14 @@ import json -from processing.text import summarize_text -from actions.web_scrape import scrape_text_with_selenium -from actions.web_search import web_search +from gpt_researcher.processing.text import summarize_text +from gpt_researcher.actions.web_scrape import scrape_text_with_selenium +from gpt_researcher.actions.web_search import web_search from langchain.chat_models import ChatOpenAI from langchain.prompts import ChatPromptTemplate from langchain.schema.output_parser import StrOutputParser -from langchain.schema.runnable import RunnableMap, RunnableLambda +from langchain.schema.runnable import RunnableMap from langchain.schema.messages import SystemMessage -from agent.prompts import auto_agent_instructions, generate_search_queries_prompt +from gpt_researcher.retriever.prompts import auto_agent_instructions, generate_search_queries_prompt from config import Config CFG = Config() diff --git a/permchain_example/search_actors/search_api.py b/examples/permchain_agents/search_actors/search_api.py similarity index 100% rename from permchain_example/search_actors/search_api.py rename to examples/permchain_agents/search_actors/search_api.py diff --git a/permchain_example/test.py b/examples/permchain_agents/test.py similarity index 58% rename from permchain_example/test.py rename to examples/permchain_agents/test.py index 736a0fd2a..c8b0a86a7 100644 --- a/permchain_example/test.py +++ b/examples/permchain_agents/test.py @@ -1,15 +1,14 @@ # main import os, sys -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) -from permchain_example.researcher import Researcher -from permchain_example.search_actors.search_api import TavilySearchActor -from permchain_example.editor_actors.editor import EditorActor -from permchain_example.reviser_actors.reviser import ReviserActor -from permchain_example.search_actors.gpt_researcher import GPTResearcherActor -from permchain_example.writer_actors.writer import WriterActor -from permchain_example.research_team import ResearchTeam -from processing.text import md_to_pdf +from examples.permchain_agents.researcher import Researcher +from examples.permchain_agents.editor_actors.editor import EditorActor +from examples.permchain_agents.reviser_actors.reviser import ReviserActor +from examples.permchain_agents.search_actors.gpt_researcher import GPTResearcherActor +from examples.permchain_agents.writer_actors.writer import WriterActor +from examples.permchain_agents.research_team import ResearchTeam +from scraping.processing.text import md_to_pdf diff --git a/permchain_example/writer_actors/writer.py b/examples/permchain_agents/writer_actors/writer.py similarity index 88% rename from permchain_example/writer_actors/writer.py rename to examples/permchain_agents/writer_actors/writer.py index e65421b20..aa4ffce14 100644 --- a/permchain_example/writer_actors/writer.py +++ b/examples/permchain_agents/writer_actors/writer.py @@ -1,7 +1,7 @@ from langchain.prompts import ChatPromptTemplate from langchain.chat_models import ChatOpenAI from langchain.schema.output_parser import StrOutputParser -from agent.prompts import generate_report_prompt, generate_agent_role_prompt +from gpt_researcher_old.retriever.prompts import generate_report_prompt, generate_agent_role_prompt from config import Config CFG = Config() diff --git a/examples/sample_report.py b/examples/sample_report.py new file mode 100644 index 000000000..6330abe1f --- /dev/null +++ b/examples/sample_report.py @@ -0,0 +1,22 @@ +from gpt_researcher import GPTResearcher +import asyncio + + +async def main(): + """ + This is a sample script that shows how to run a research report. + """ + # Query + query = "What happened in the latest burning man floods?" + + # Report Type + report_type = "research_report" + + # Run Research + researcher = GPTResearcher(query=query, report_type=report_type, config_path=None) + report = await researcher.run() + return report + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/client/index.html b/frontend/index.html similarity index 97% rename from client/index.html rename to frontend/index.html index d3447b277..6ef261fd2 100644 --- a/client/index.html +++ b/frontend/index.html @@ -95,7 +95,7 @@
elements + text += element.text + "\n" + return text \ No newline at end of file diff --git a/agent/llm_utils.py b/gpt_researcher/utils/llm.py similarity index 58% rename from agent/llm_utils.py rename to gpt_researcher/utils/llm.py index ed432ebee..96f560ad7 100644 --- a/agent/llm_utils.py +++ b/gpt_researcher/utils/llm.py @@ -1,32 +1,22 @@ +# libraries from __future__ import annotations - import json - from fastapi import WebSocket -import time - -import openai from langchain.adapters import openai as lc_openai from colorama import Fore, Style -from openai.error import APIError, RateLimitError - -from agent.prompts import auto_agent_instructions -from config import Config - -CFG = Config() +from typing import Optional -openai.api_key = CFG.openai_api_key +from gpt_researcher.master.prompts import auto_agent_instructions -from typing import Optional -import logging -def create_chat_completion( - messages: list, # type: ignore - model: Optional[str] = None, - temperature: float = CFG.temperature, - max_tokens: Optional[int] = None, - stream: Optional[bool] = False, - websocket: WebSocket | None = None, +async def create_chat_completion( + messages: list, # type: ignore + model: Optional[str] = None, + temperature: float = 1.0, + max_tokens: Optional[int] = None, + llm_provider: Optional[str] = None, + stream: Optional[bool] = False, + websocket: WebSocket | None = None, ) -> str: """Create a chat completion using the OpenAI API Args: @@ -35,6 +25,8 @@ def create_chat_completion( temperature (float, optional): The temperature to use. Defaults to 0.9. max_tokens (int, optional): The max tokens to use. Defaults to None. stream (bool, optional): Whether to stream the response. Defaults to False. + llm_provider (str, optional): The LLM Provider to use. + webocket (WebSocket): The websocket used in the currect request Returns: str: The response from the chat completion """ @@ -44,13 +36,11 @@ def create_chat_completion( raise ValueError("Model cannot be None") if max_tokens is not None and max_tokens > 8001: raise ValueError(f"Max tokens cannot be more than 8001, but got {max_tokens}") - if stream and websocket is None: - raise ValueError("Websocket cannot be None when stream is True") # create response for attempt in range(10): # maximum of 10 attempts - response = send_chat_completion_request( - messages, model, temperature, max_tokens, stream, websocket + response = await send_chat_completion_request( + messages, model, temperature, max_tokens, stream, llm_provider, websocket ) return response @@ -58,33 +48,35 @@ def create_chat_completion( raise RuntimeError("Failed to get response from OpenAI API") -def send_chat_completion_request( - messages, model, temperature, max_tokens, stream, websocket +import logging + + +async def send_chat_completion_request( + messages, model, temperature, max_tokens, stream, llm_provider, websocket ): if not stream: result = lc_openai.ChatCompletion.create( - model=model, # Change model here to use different models + model=model, # Change model here to use different models messages=messages, temperature=temperature, max_tokens=max_tokens, - provider=CFG.llm_provider, # Change provider here to use a different API + provider=llm_provider, # Change provider here to use a different API ) return result["choices"][0]["message"]["content"] else: - return stream_response(model, messages, temperature, max_tokens, websocket) + return await stream_response(model, messages, temperature, max_tokens, llm_provider, websocket) -async def stream_response(model, messages, temperature, max_tokens, websocket): +async def stream_response(model, messages, temperature, max_tokens, llm_provider, websocket=None): paragraph = "" response = "" - print(f"streaming response...") for chunk in lc_openai.ChatCompletion.create( model=model, messages=messages, temperature=temperature, max_tokens=max_tokens, - provider=CFG.llm_provider, + provider=llm_provider, stream=True, ): content = chunk["choices"][0].get("delta", {}).get("content") @@ -92,33 +84,37 @@ async def stream_response(model, messages, temperature, max_tokens, websocket): response += content paragraph += content if "\n" in paragraph: - await websocket.send_json({"type": "report", "output": paragraph}) + if websocket is not None: + await websocket.send_json({"type": "report", "output": paragraph}) + else: + print(f"{Fore.GREEN}{paragraph}{Style.RESET_ALL}") paragraph = "" - print(f"streaming response complete") return response -def choose_agent(task: str) -> dict: - """Determines what agent should be used +def choose_agent(smart_llm_model: str, llm_provider: str, task: str) -> dict: + """Determines what server should be used Args: task (str): The research question the user asked + smart_llm_model (str): the llm model to be used + llm_provider (str): the llm provider used Returns: - agent - The agent that will be used - agent_role_prompt (str): The prompt for the agent + server - The server that will be used + agent_role_prompt (str): The prompt for the server """ try: response = create_chat_completion( - model=CFG.smart_llm_model, + model=smart_llm_model, messages=[ {"role": "system", "content": f"{auto_agent_instructions()}"}, {"role": "user", "content": f"task: {task}"}], temperature=0, + llm_provider=llm_provider ) - - return json.loads(response) + agent_dict = json.loads(response) + print(f"Agent: {agent_dict.get('server')}") + return agent_dict except Exception as e: print(f"{Fore.RED}Error in choose_agent: {e}{Style.RESET_ALL}") - return {"agent": "Default Agent", + return {"server": "Default Agent", "agent_role_prompt": "You are an AI critical thinker research assistant. Your sole purpose is to write well written, critically acclaimed, objective and structured reports on given text."} - - diff --git a/gpt_researcher/utils/websocket_manager.py b/gpt_researcher/utils/websocket_manager.py new file mode 100644 index 000000000..bfdcdd4a4 --- /dev/null +++ b/gpt_researcher/utils/websocket_manager.py @@ -0,0 +1,68 @@ +# connect any client to gpt-researcher using websocket +import asyncio +import datetime +from typing import List, Dict +from fastapi import WebSocket +from gpt_researcher.master.agent import GPTResearcher + + +class WebSocketManager: + """Manage websockets""" + def __init__(self): + """Initialize the WebSocketManager class.""" + self.active_connections: List[WebSocket] = [] + self.sender_tasks: Dict[WebSocket, asyncio.Task] = {} + self.message_queues: Dict[WebSocket, asyncio.Queue] = {} + + async def start_sender(self, websocket: WebSocket): + """Start the sender task.""" + queue = self.message_queues.get(websocket) + if not queue: + return + + while True: + message = await queue.get() + if websocket in self.active_connections: + try: + await websocket.send_text(message) + except: + break + else: + break + + async def connect(self, websocket: WebSocket): + """Connect a websocket.""" + await websocket.accept() + self.active_connections.append(websocket) + self.message_queues[websocket] = asyncio.Queue() + self.sender_tasks[websocket] = asyncio.create_task(self.start_sender(websocket)) + + async def disconnect(self, websocket: WebSocket): + """Disconnect a websocket.""" + if websocket in self.active_connections: + self.active_connections.remove(websocket) + self.sender_tasks[websocket].cancel() + await self.message_queues[websocket].put(None) + del self.sender_tasks[websocket] + del self.message_queues[websocket] + + async def start_streaming(self, task, report_type, websocket): + """Start streaming the output.""" + report = await run_agent(task, report_type, websocket) + return report + + +async def run_agent(task, report_type, websocket): + """Run the agent.""" + # measure time + start_time = datetime.datetime.now() + # add customized JSON config file path here + config_path = None + # run agent + researcher = GPTResearcher(task, report_type, config_path, websocket) + report = await researcher.run() + # measure time + end_time = datetime.datetime.now() + await websocket.send_json({"type": "logs", "output": f"\nTotal run time: {end_time - start_time}\n"}) + + return report diff --git a/main.py b/main.py index c418ccd20..b737e72df 100644 --- a/main.py +++ b/main.py @@ -1,71 +1,6 @@ -from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect -from fastapi.staticfiles import StaticFiles -from fastapi.templating import Jinja2Templates -from pydantic import BaseModel -import json -import os - -from agent.llm_utils import choose_agent -from agent.run import WebSocketManager - - -class ResearchRequest(BaseModel): - task: str - report_type: str - agent: str - - - -app = FastAPI() -app.mount("/site", StaticFiles(directory="client"), name="site") -app.mount("/static", StaticFiles(directory="client/static"), name="static") -# Dynamic directory for outputs once first research is run -@app.on_event("startup") -def startup_event(): - if not os.path.isdir("outputs"): - os.makedirs("outputs") - app.mount("/outputs", StaticFiles(directory="outputs"), name="outputs") - -templates = Jinja2Templates(directory="client") - -manager = WebSocketManager() - - -@app.get("/") -async def read_root(request: Request): - return templates.TemplateResponse('index.html', {"request": request, "report": None}) - - -@app.websocket("/ws") -async def websocket_endpoint(websocket: WebSocket): - await manager.connect(websocket) - try: - while True: - data = await websocket.receive_text() - if data.startswith("start"): - json_data = json.loads(data[6:]) - task = json_data.get("task") - report_type = json_data.get("report_type") - agent = json_data.get("agent") - # temporary so "normal agents" can still be used and not just auto generated, will be removed when we move to auto generated - if agent == "Auto Agent": - agent_dict = choose_agent(task) - agent = agent_dict.get("agent") - agent_role_prompt = agent_dict.get("agent_role_prompt") - else: - agent_role_prompt = None - - await websocket.send_json({"type": "logs", "output": f"Initiated an Agent: {agent}"}) - if task and report_type and agent: - await manager.start_streaming(task, report_type, agent, agent_role_prompt, websocket) - else: - print("Error: not enough parameters provided.") - - except WebSocketDisconnect: - await manager.disconnect(websocket) - +from backend.server import app if __name__ == "__main__": import uvicorn - uvicorn.run(app, host="0.0.0.0", port=8000) + uvicorn.run(app, host="0.0.0.0", port=8000) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 28a1dfeb2..25ca1187a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,22 +2,23 @@ asyncio==3.4.3 beautifulsoup4==4.12.2 colorama==0.4.6 -duckduckgo_search==3.9.3 +duckduckgo_search==3.9.5 md2pdf==1.0.1 -openai~=0.28.1 -playwright==1.38.0 +openai~=1.2.3 +playwright==1.39.0 python-dotenv~=1.0.0 pyyaml==6.0.1 -selenium +selenium==4.15.2 webdriver-manager==4.0.1 -flask -uvicorn -pydantic -fastapi -python-multipart -markdown -langchain==0.0.308 -tavily-python==0.2.4 +uvicorn==0.24.0.post1 +pydantic==2.4.2 +fastapi==0.104.1 +python-multipart==0.0.6 +markdown==3.5.1 +langchain==0.0.335 +tavily-python==0.2.6 permchain==0.0.3 -arxiv -pymupdf +arxiv==2.0.0 +PyMuPDF==1.23.6 +requests==2.31.0 +jinja2==3.1.2 \ No newline at end of file diff --git a/processing/__init__.py b/scraping/__init__.py similarity index 100% rename from processing/__init__.py rename to scraping/__init__.py diff --git a/js/overlay.js b/scraping/js/overlay.js similarity index 100% rename from js/overlay.js rename to scraping/js/overlay.js diff --git a/scraping/processing/__init__.py b/scraping/processing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/processing/html.py b/scraping/processing/html.py similarity index 100% rename from processing/html.py rename to scraping/processing/html.py diff --git a/processing/text.py b/scraping/processing/text.py similarity index 88% rename from processing/text.py rename to scraping/processing/text.py index a9ab907f2..28ac8ef9d 100644 --- a/processing/text.py +++ b/scraping/processing/text.py @@ -1,17 +1,14 @@ """Text processing functions""" import urllib from typing import Dict, Generator, Optional -import string from selenium.webdriver.remote.webdriver import WebDriver from config import Config -from agent.llm_utils import create_chat_completion +from gpt_researcher_old.retriever.llm_utils import create_chat_completion import os from md2pdf.core import md2pdf -CFG = Config() - def split_text(text: str, max_length: int = 8192) -> Generator[str, None, None]: """Split text into chunks of a maximum length @@ -44,11 +41,14 @@ def split_text(text: str, max_length: int = 8192) -> Generator[str, None, None]: def summarize_text( - url: str, text: str, question: str, driver: Optional[WebDriver] = None + fast_llm_model: str, summary_token_limit: int, llm_provider: str, url: str, text: str, question: str, driver: Optional[WebDriver] = None ) -> str: """Summarize text using the OpenAI API Args: + fast_llm_model (str): The fast LLM model e.g gpt3.5-turbo-16k + summary_token_limit (int): The summary token limit + llm_provider (str): The llm provider url (str): The url of the text text (str): The text to summarize question (str): The question to ask the model @@ -76,9 +76,10 @@ def summarize_text( messages = [create_message(chunk, question)] summary = create_chat_completion( - model=CFG.fast_llm_model, + model=fast_llm_model, messages=messages, - max_tokens=CFG.summary_token_limit + max_tokens=summary_token_limit, + llm_provider=llm_provider ) summaries.append(summary) #memory_to_add = f"Source: {url}\n" f"Content summary part#{i + 1}: {summary}" @@ -89,9 +90,10 @@ def summarize_text( messages = [create_message(combined_summary, question)] final_summary = create_chat_completion( - model=CFG.fast_llm_model, + model=fast_llm_model, messages=messages, - max_tokens=CFG.summary_token_limit + max_tokens=summary_token_limit, + llm_provider=llm_provider, ) print("Final summary length: ", len(combined_summary)) print(final_summary) diff --git a/scraping/scrape_skills.py b/scraping/scrape_skills.py new file mode 100644 index 000000000..0d02d4422 --- /dev/null +++ b/scraping/scrape_skills.py @@ -0,0 +1,31 @@ +from langchain.document_loaders import PyMuPDFLoader +from langchain.retrievers import ArxivRetriever + + +def scrape_pdf_with_pymupdf(url) -> str: + """Scrape a pdf with pymupdf + + Args: + url (str): The url of the pdf to scrape + + Returns: + str: The text scraped from the pdf + """ + loader = PyMuPDFLoader(url) + doc = loader.load() + return str(doc) + + +def scrape_pdf_with_arxiv(query) -> str: + """Scrape a pdf with arxiv + default document length of 70000 about ~15 pages or None for no limit + + Args: + query (str): The query to search for + + Returns: + str: The text scraped from the pdf + """ + retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None) + docs = retriever.get_relevant_documents(query=query) + return docs[0].page_content \ No newline at end of file diff --git a/actions/web_scrape.py b/scraping/web_scrape.py similarity index 77% rename from actions/web_scrape.py rename to scraping/web_scrape.py index 97a5c38b6..3c038fc12 100644 --- a/actions/web_scrape.py +++ b/scraping/web_scrape.py @@ -16,26 +16,33 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.wait import WebDriverWait from fastapi import WebSocket -from langchain.document_loaders import PyMuPDFLoader -from langchain.retrievers import ArxivRetriever -import processing.text as summary - -from config import Config -from processing.html import extract_hyperlinks, format_hyperlinks +from scraping import scrape_skills, processing as summary +from scraping.processing.html import extract_hyperlinks, format_hyperlinks from concurrent.futures import ThreadPoolExecutor +from scraping.processing.text import summarize_text + executor = ThreadPoolExecutor() FILE_DIR = Path(__file__).parent.parent -CFG = Config() -async def async_browse(url: str, question: str, websocket: WebSocket) -> str: +async def async_browse( + selenium_web_browser: str, + user_agent: str, + fast_llm_model: str, + summary_token_limit: str, + llm_provider: str, + url: str, question: str, + websocket: WebSocket +) -> str: """Browse a website and return the answer and links to the user Args: + selenium_web_browser (str): The web browser used for scraping + user_agent (str): The user agent used when scraping url (str): The url of the website to browse question (str): The question asked by the user websocket (WebSocketManager): The websocket manager @@ -47,28 +54,33 @@ async def async_browse(url: str, question: str, websocket: WebSocket) -> str: executor = ThreadPoolExecutor(max_workers=8) print(f"Scraping url {url} with question {question}") - await websocket.send_json( - { - "type": "logs", - "output": f"π Browsing the {url} for relevant about: {question}...", - } - ) + if websocket: + await websocket.send_json( + { + "type": "logs", + "output": f"π Browsing the {url} for relevant about: {question}...", + } + ) + else: + print(f"π Browsing the {url} for relevant about: {question}...") try: driver, text = await loop.run_in_executor( - executor, scrape_text_with_selenium, url + executor, scrape_text_with_selenium, selenium_web_browser, user_agent, url ) await loop.run_in_executor(executor, add_header, driver) summary_text = await loop.run_in_executor( - executor, summary.summarize_text, url, text, question, driver - ) - - await websocket.send_json( - { - "type": "logs", - "output": f"π Information gathered from url {url}: {summary_text}", - } + executor, summarize_text, fast_llm_model, summary_token_limit, llm_provider, url, text, question, driver ) + if websocket: + await websocket.send_json( + { + "type": "logs", + "output": f"π Information gathered from url {url}: {summary_text}", + } + ) + else: + print(f"π Information gathered from url {url}: {summary_text}") return f"Information gathered from url {url}: {summary_text}" except Exception as e: @@ -106,11 +118,13 @@ def browse_website(url: str, question: str) -> tuple[str, WebDriver]: return f"Answer gathered from website: {summary_text} \n \n Links: {links}", driver -def scrape_text_with_selenium(url: str) -> tuple[WebDriver, str]: +def scrape_text_with_selenium(selenium_web_browser: str, user_agent: str, url: str) -> tuple[WebDriver, str]: """Scrape text from a website using selenium Args: url (str): The url of the website to scrape + selenium_web_browser (str): The web browser used to scrape + user_agent (str): The user agent used when scraping Returns: Tuple[WebDriver, str]: The webdriver and the text scraped from the website @@ -123,14 +137,14 @@ def scrape_text_with_selenium(url: str) -> tuple[WebDriver, str]: "firefox": FirefoxOptions, } - options = options_available[CFG.selenium_web_browser]() - options.add_argument(f"user-agent={CFG.user_agent}") + options = options_available[selenium_web_browser]() + options.add_argument(f"user-agent={user_agent}") options.add_argument("--headless") options.add_argument("--enable-javascript") - if CFG.selenium_web_browser == "firefox": + if selenium_web_browser == "firefox": driver = webdriver.Firefox(options=options) - elif CFG.selenium_web_browser == "safari": + elif selenium_web_browser == "safari": # Requires a bit more setup on the users end # See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari driver = webdriver.Safari(options=options) @@ -151,11 +165,11 @@ def scrape_text_with_selenium(url: str) -> tuple[WebDriver, str]: # check if url is a pdf or arxiv link if url.endswith(".pdf"): - text = scrape_pdf_with_pymupdf(url) + text = scrape_skills.scrape_pdf_with_pymupdf(url) elif "arxiv" in url: # parse the document number from the url doc_num = url.split("/")[-1] - text = scrape_pdf_with_arxiv(doc_num) + text = scrape_skills.scrape_pdf_with_arxiv(doc_num) else: # Get the HTML content directly from the browser's DOM page_source = driver.execute_script("return document.body.outerHTML;") @@ -231,32 +245,3 @@ def add_header(driver: WebDriver) -> None: None """ driver.execute_script(open(f"{FILE_DIR}/js/overlay.js", "r").read()) - - -def scrape_pdf_with_pymupdf(url) -> str: - """Scrape a pdf with pymupdf - - Args: - url (str): The url of the pdf to scrape - - Returns: - str: The text scraped from the pdf - """ - loader = PyMuPDFLoader(url) - doc = loader.load() - return str(doc) - - -def scrape_pdf_with_arxiv(query) -> str: - """Scrape a pdf with arxiv - default document length of 70000 about ~15 pages or None for no limit - - Args: - query (str): The query to search for - - Returns: - str: The text scraped from the pdf - """ - retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None) - docs = retriever.get_relevant_documents(query=query) - return docs[0].page_content \ No newline at end of file