From dfab5006e90c45991289c30082cb6abd26d43460 Mon Sep 17 00:00:00 2001 From: Makesh Srinivasan <66047630+Makesh-Srinivasan@users.noreply.github.com> Date: Wed, 7 Aug 2024 00:51:11 -0400 Subject: [PATCH 01/10] Fix source_urls and add add_additional_sources --- gpt_researcher/master/agent.py | 577 +++++++++++++++++++++++++++++++++ 1 file changed, 577 insertions(+) create mode 100644 gpt_researcher/master/agent.py diff --git a/gpt_researcher/master/agent.py b/gpt_researcher/master/agent.py new file mode 100644 index 000000000..a7f6e517f --- /dev/null +++ b/gpt_researcher/master/agent.py @@ -0,0 +1,577 @@ +import asyncio +import time + +from typing import Set + +from gpt_researcher.config import Config +from gpt_researcher.context.compression import ContextCompressor, WrittenContentCompressor +from gpt_researcher.document import DocumentLoader, LangChainDocumentLoader +from gpt_researcher.master.actions import * +from gpt_researcher.memory import Memory +from gpt_researcher.utils.enum import ReportSource, ReportType, Tone + + +class GPTResearcher: + """ + GPT Researcher + """ + + def __init__( + self, + query: str, + report_type: str = ReportType.ResearchReport.value, + report_source=ReportSource.Web.value, + tone: Tone = Tone.Objective, + source_urls=None, + add_additional_sources=False, + documents=None, + config_path=None, + websocket=None, + agent=None, + role=None, + parent_query: str = "", + subtopics: list = [], + visited_urls: set = set(), + verbose: bool = True, + context=[], + headers: dict = None, # Add headers parameter + ): + """ + Initializes the GPTResearcher class with the specified parameters to set up the research environment. + + Args: + query (str): The main query for which research is conducted. + report_type (str): Type of report to generate. Defaults to a research report. + report_source (str): The source of data for research. Defaults to web sources. + tone (Tone): The tone of the report; objective by default. + source_urls (list or None): Initial list of URLs for research. + add_additional_sources (bool): Whether to add additional sources/links to the research. Set a non-empty valid value for source_urls for this parameter to take effect. + documents (list or None): Predefined list of documents to use. + config_path (str or None): Path to the configuration file. + websocket: Websocket connection for real-time updates. + agent (str or None): Designated agent for conducting research. + role (str or None): Role of the agent if specified. + parent_query (str): Main query that this query is derived from if any. + subtopics (list): List of subtopics related to the main query from the user. + visited_urls (set): Set of URLs that have already been visited. + verbose (bool): Toggle for verbose output for debugging or detailed logs. + context (list): Initial context for the research. + headers (dict or None): HTTP headers for web requests. + + Initializes internal state and prepares the gptr with necessary configuration. 
+ """ + + self.headers = headers or {} + self.query: str = query + self.agent: str = agent + self.role: str = role + self.report_type: str = report_type + self.report_prompt: str = get_prompt_by_report_type( + self.report_type + ) # this validates the report type + self.report_source: str = report_source + self.research_costs: float = 0.0 + self.cfg = Config(config_path) + self.retriever = get_retriever(self.headers.get("retriever")) or get_retriever( + self.cfg.retriever + ) or get_default_retriever() + self.context = context + self.source_urls = source_urls + self.add_additional_sources: bool = add_additional_sources + self.documents = documents + self.memory = Memory(self.cfg.embedding_provider, self.headers) + self.visited_urls: set[str] = visited_urls + self.verbose: bool = verbose + self.websocket = websocket + self.headers = headers or {} + # Ensure tone is an instance of Tone enum + if isinstance(tone, dict): + print(f"Invalid tone format: {tone}. Setting to default Tone.Objective.") + self.tone = Tone.Objective + elif isinstance(tone, str): + self.tone = Tone[tone] + else: + self.tone = tone + + # Only relevant for DETAILED REPORTS + # -------------------------------------- + + # Stores the main query of the detailed report + self.parent_query = parent_query + + # Stores all the user provided subtopics + self.subtopics = subtopics + + async def conduct_research(self): + """ + Runs the GPT Researcher to conduct research on the specified source + """ + # Reset visited_urls and source_urls at the start of each research task + self.visited_urls.clear() + + if self.verbose: + await stream_output( + "logs", + "starting_research", + f"πŸ”Ž Starting the research task for '{self.query}'...", + self.websocket, + ) + + # Generate Agent + if not (self.agent and self.role): + self.agent, self.role = await choose_agent( + query=self.query, + cfg=self.cfg, + parent_query=self.parent_query, + cost_callback=self.add_costs, + headers=self.headers, + ) + + if self.verbose: + await stream_output("logs", "agent_generated", self.agent, self.websocket) + + # If specified, the researcher will use the given urls as the context for the research. + if self.source_urls: + self.context = await self.__get_context_by_urls(self.source_urls) + if len(self.context) == 0 and self.verbose: + # Could not find any relevant resources in source_urls to answer the query or sub-query. 
Will answer using model's inherent knowledge + await stream_output( + "logs", + "answering_from_memory", + f"🧐 I was unable to find relevant context in the provided sources...", + self.websocket, + ) + # If add_additional_sources parameter is set, more resources can be gathered to create additional context using default web search + if self.add_additional_sources: + additional_research = await self.__get_context_by_search(self.query) + self.context += ' '.join(additional_research) + + elif self.report_source == ReportSource.Local.value: + document_data = await DocumentLoader(self.cfg.doc_path).load() + self.context = await self.__get_context_by_search(self.query, document_data) + + elif self.report_source == ReportSource.LangChainDocuments.value: + langchain_documents_data = await LangChainDocumentLoader( + self.documents + ).load() + self.context = await self.__get_context_by_search( + self.query, langchain_documents_data + ) + + # Default web based research + else: + self.context = await self.__get_context_by_search(self.query) + + time.sleep(2) + if self.verbose: + await stream_output( + "logs", + "research_step_finalized", + f"Finalized research step.\nπŸ’Έ Total Research Costs: ${self.get_costs()}", + self.websocket, + ) + + return self.context + + async def write_report(self, existing_headers: list = [], relevant_written_contents: list = []): + """ + Writes the report based on research conducted + + Returns: + str: The report + """ + report = "" + + if self.verbose: + await stream_output( + "logs", + "task_summary_coming_up", + f"✍️ Writing summary for research task: {self.query} (this may take a few minutes)...", + self.websocket, + ) + + if self.report_type == "custom_report": + self.role = self.cfg.agent_role if self.cfg.agent_role else self.role + report = await generate_report( + query=self.query, + context=self.context, + agent_role_prompt=self.role, + report_type=self.report_type, + report_source=self.report_source, + tone=self.tone, + websocket=self.websocket, + cfg=self.cfg, + headers=self.headers, + ) + elif self.report_type == "subtopic_report": + report = await generate_report( + query=self.query, + context=self.context, + agent_role_prompt=self.role, + report_type=self.report_type, + report_source=self.report_source, + websocket=self.websocket, + tone=self.tone, + cfg=self.cfg, + main_topic=self.parent_query, + existing_headers=existing_headers, + relevant_written_contents=relevant_written_contents, + cost_callback=self.add_costs, + headers=self.headers, + ) + else: + report = await generate_report( + query=self.query, + context=self.context, + agent_role_prompt=self.role, + report_type=self.report_type, + report_source=self.report_source, + tone=self.tone, + websocket=self.websocket, + cfg=self.cfg, + cost_callback=self.add_costs, + headers=self.headers, + ) + + return report + + async def __get_context_by_urls(self, urls): + """ + Scrapes and compresses the context from the given urls + """ + new_search_urls = await self.__get_new_urls(urls) + if self.verbose: + await stream_output( + "logs", + "source_urls", + f"πŸ—‚οΈ I will conduct my research based on the following urls: {new_search_urls}...", + self.websocket, + ) + + scraped_sites = scrape_urls(new_search_urls, self.cfg) + return await self.__get_similar_content_by_query(self.query, scraped_sites) + + async def __get_context_by_search(self, query, scraped_data: list = []): + """ + Generates the context for the research task by searching the query and scraping the results + Returns: + context: List of context + 
""" + context = [] + # Generate Sub-Queries including original query + sub_queries = await get_sub_queries( + query=query, + agent_role_prompt=self.role, + cfg=self.cfg, + parent_query=self.parent_query, + report_type=self.report_type, + cost_callback=self.add_costs, + openai_api_key=self.headers.get("openai_api_key"), + ) + + # If this is not part of a sub researcher, add original query to research for better results + if self.report_type != "subtopic_report": + sub_queries.append(query) + + if self.verbose: + await stream_output( + "logs", + "subqueries", + f"πŸ—‚οΈ I will conduct my research based on the following queries: {sub_queries}...", + self.websocket, + True, + sub_queries, + ) + + # Using asyncio.gather to process the sub_queries asynchronously + context = await asyncio.gather( + *[ + self.__process_sub_query(sub_query, scraped_data) + for sub_query in sub_queries + ] + ) + return context + + async def __process_sub_query(self, sub_query: str, scraped_data: list = []): + """Takes in a sub query and scrapes urls based on it and gathers context. + + Args: + sub_query (str): The sub-query generated from the original query + scraped_data (list): Scraped data passed in + + Returns: + str: The context gathered from search + """ + if self.verbose: + await stream_output( + "logs", + "running_subquery_research", + f"\nπŸ” Running research for '{sub_query}'...", + self.websocket, + ) + + if not scraped_data: + scraped_data = await self.__scrape_data_by_query(sub_query) + + content = await self.__get_similar_content_by_query(sub_query, scraped_data) + + if content and self.verbose: + await stream_output( + "logs", "subquery_context_window", f"πŸ“ƒ {content}", self.websocket + ) + elif self.verbose: + await stream_output( + "logs", + "subquery_context_not_found", + f"🀷 No content found for '{sub_query}'...", + self.websocket, + ) + return content + + async def __get_new_urls(self, url_set_input): + """Gets the new urls from the given url set. 
+ Args: url_set_input (set[str]): The url set to get the new urls from + Returns: list[str]: The new urls from the given url set + """ + + new_urls = [] + for url in url_set_input: + if url not in self.visited_urls: + self.visited_urls.add(url) + new_urls.append(url) + if self.verbose: + await stream_output( + "logs", + "added_source_url", + f"βœ… Added source url to research: {url}\n", + self.websocket, + True, + url, + ) + + return new_urls + + async def __scrape_data_by_query(self, sub_query): + """ + Runs a sub-query + Args: + sub_query: + + Returns: + Summary + """ + # Get Urls + retriever = self.retriever(sub_query) + search_results = await asyncio.to_thread( + retriever.search, max_results=self.cfg.max_search_results_per_query + ) + new_search_urls = await self.__get_new_urls( + [url.get("href") for url in search_results] + ) + + # Scrape Urls + if self.verbose: + await stream_output( + "logs", + "researching", + f"πŸ€” Researching for relevant information...\n", + self.websocket, + ) + + # Scrape Urls + scraped_content_results = await asyncio.to_thread( + scrape_urls, new_search_urls, self.cfg + ) + return scraped_content_results + + async def __get_similar_content_by_query(self, query, pages): + if self.verbose: + await stream_output( + "logs", + "fetching_query_content", + f"πŸ“š Getting relevant content based on query: {query}...", + self.websocket, + ) + + # Summarize Raw Data + context_compressor = ContextCompressor( + documents=pages, embeddings=self.memory.get_embeddings() + ) + # Run Tasks + return await context_compressor.async_get_context( + query=query, max_results=8, cost_callback=self.add_costs + ) + + ######################################################################################## + + # GETTERS & SETTERS + def get_source_urls(self) -> list: + return list(self.visited_urls) + + def get_research_context(self) -> list: + return self.context + + def get_costs(self) -> float: + return self.research_costs + + def set_verbose(self, verbose: bool): + self.verbose = verbose + + def add_costs(self, cost: int) -> None: + if not isinstance(cost, float) and not isinstance(cost, int): + raise ValueError("Cost must be an integer or float") + self.research_costs += cost + + ######################################################################################## + + # DETAILED REPORT + + async def write_introduction(self): + # Construct Report Introduction from main topic research + introduction = await get_report_introduction( + self.query, + self.context, + self.role, + self.cfg, + self.websocket, + self.add_costs, + ) + + return introduction + + async def get_subtopics(self): + """ + This async function generates subtopics based on user input and other parameters. + + Returns: + The `get_subtopics` function is returning the `subtopics` that are generated by the + `construct_subtopics` function. + """ + if self.verbose: + await stream_output( + "logs", + "generating_subtopics", + f"πŸ€” Generating subtopics...", + self.websocket, + ) + + subtopics = await construct_subtopics( + task=self.query, + data=self.context, + config=self.cfg, + # This is a list of user provided subtopics + subtopics=self.subtopics, + ) + + if self.verbose: + await stream_output( + "logs", "subtopics", f"πŸ“‹Subtopics: {subtopics}", self.websocket + ) + + return subtopics + + async def get_draft_section_titles(self): + """ + Writes the draft section titles based on research conducted. The draft section titles are used to retrieve the previous relevant written contents. 
+ + Returns: + str: The headers markdown text + """ + if self.verbose: + await stream_output( + "logs", + "task_summary_coming_up", + f"✍️ Writing draft section titles for research task: {self.query}...", + self.websocket, + ) + + draft_section_titles = await generate_draft_section_titles( + query=self.query, + context=self.context, + agent_role_prompt=self.role, + report_type=self.report_type, + websocket=self.websocket, + cfg=self.cfg, + main_topic=self.parent_query, + cost_callback=self.add_costs, + headers=self.headers, + ) + + return draft_section_titles + + async def __get_similar_written_contents_by_query(self, + query: str, + written_contents: List[Dict], + similarity_threshold: float = 0.5, + max_results: int = 10 + ) -> List[str]: + """ + Asynchronously retrieves similar written contents based on a given query. + + Args: + query (str): The query to search for similar written contents. + written_contents (List[Dict]): List of written contents to search through. + similarity_threshold (float, optional): The minimum similarity score for content to be considered relevant. + Defaults to 0.5. + max_results (int, optional): The maximum number of similar contents to return. Defaults to 10. + + Returns: + List[str]: A list of similar written contents, limited by max_results. + """ + if self.verbose: + await stream_output( + "logs", + "fetching_relevant_written_content", + f"πŸ”Ž Getting relevant written content based on query: {query}...", + self.websocket, + ) + + # Retrieve similar written contents based on the query + # Use a higher similarity threshold to ensure more relevant results and reduce irrelevant matches + written_content_compressor = WrittenContentCompressor( + documents=written_contents, embeddings=self.memory.get_embeddings(), similarity_threshold=similarity_threshold + ) + return await written_content_compressor.async_get_context( + query=query, max_results=max_results, cost_callback=self.add_costs + ) + + async def get_similar_written_contents_by_draft_section_titles( + self, + current_subtopic: str, + draft_section_titles: List[str], + written_contents: List[Dict], + max_results: int = 10 + ) -> List[str]: + """ + Retrieve similar written contents based on current subtopic and draft section titles. + + Args: + current_subtopic (str): The current subtopic. + draft_section_titles (List[str]): List of draft section titles. + written_contents (List[Dict]): List of written contents to search through. + max_results (int): Maximum number of results to return. Defaults to 10. + + Returns: + List[str]: List of relevant written contents. 
+ """ + all_queries = [current_subtopic] + draft_section_titles + + async def process_query(query: str) -> Set[str]: + return set(await self.__get_similar_written_contents_by_query(query, written_contents)) + + # Run all queries in parallel + results = await asyncio.gather(*[process_query(query) for query in all_queries]) + + # Combine all results + relevant_contents = set().union(*results) + + # Limit the number of results + relevant_contents = list(relevant_contents)[:max_results] + + if relevant_contents and self.verbose: + prettier_contents = "\n".join(relevant_contents) + await stream_output( + "logs", "relevant_contents_context", f"πŸ“ƒ {prettier_contents}", self.websocket + ) + + return relevant_contents \ No newline at end of file From f9c991a2f90582d6ef45bd09c67610270b951363 Mon Sep 17 00:00:00 2001 From: Makesh Srinivasan <66047630+Makesh-Srinivasan@users.noreply.github.com> Date: Wed, 7 Aug 2024 00:51:51 -0400 Subject: [PATCH 02/10] add test case for the parameter add_additional_sources --- tests/research_test.py | 103 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 tests/research_test.py diff --git a/tests/research_test.py b/tests/research_test.py new file mode 100644 index 000000000..ec5fce21c --- /dev/null +++ b/tests/research_test.py @@ -0,0 +1,103 @@ +""" +Hi! The following test cases are for the new parameter `add_additional_sources` and fix on the functional error with `source_urls` in GPTResearcher class. + +The source_urls parameter was resetting each time in conduct_research function causing gptr to forget the given links. Now, that has been fixed and a new parameter is introduced. +This parameter named will `add_additional_sources` allow GPTR to research on sources other than the provided sources via source_urls if set to True. +Default is False, i.e., no additional research will be conducted on newer sources. +""" + +## Notes: +## Please uncomment the test case to run and comment the rest. +## Thanks! + + + +#### Test case 1 (original test case as control from https://docs.gptr.dev/docs/gpt-researcher/tailored-research) + +from gpt_researcher.master.agent import GPTResearcher # Ensure this path is correct +import asyncio + +async def get_report(query: str, report_type: str, sources: list) -> str: + researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources) + await researcher.conduct_research() + report = await researcher.write_report() + return report, researcher + +if __name__ == "__main__": + query = "Research the latest advancements in AI and provide a detailed report in APA format including sources." + report_type = "research_report" + sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] # query is related + + report, researcher = asyncio.run(get_report(query, report_type, sources)) + print(report) + + print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, so there will be relevant context present + + + +#### Test case 2 (Illustrating the problem, i.e., source_urls are not scoured. 
Hence, no relevant context) + +# from gpt_researcher.master.agent import GPTResearcher # Ensure this path is correct +# import asyncio + +# async def get_report(query: str, report_type: str, sources: list) -> str: +# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources) +# await researcher.conduct_research() +# report = await researcher.write_report() +# return report, researcher + +# if __name__ == "__main__": +# query = "What is Microsoft's business model?" +# report_type = "research_report" +# sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"] # query is UNRELATED. + +# report, researcher = asyncio.run(get_report(query, report_type, sources)) +# print(report) + +# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say 0 (zero) value because the query is UNRELATED to the contents of the pages, so there will be NO relevant context present + + + +#### Test case 3 (Suggested solution - add_additional_sources parameter allows GPTR to scour more of the web and not restrict to source_urls) + +# from gpt_researcher.master.agent import GPTResearcher # Ensure this path is correct +# import asyncio + +# async def get_report(query: str, report_type: str, sources: list) -> str: +# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, add_additional_sources=True) +# await researcher.conduct_research() +# report = await researcher.write_report() +# return report, researcher + +# if __name__ == "__main__": +# query = "What is Microsoft's business model?" +# report_type = "research_report" +# sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"] # query is UNRELATED + +# report, researcher = asyncio.run(get_report(query, report_type, sources)) +# print(report) + +# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is UNRELATED to the contents of the page, but the add_additional_sources is set which should make gptr do default web search to gather contexts + + + +# #### Test case 4 (Furthermore, GPTR will create more context in addition to source_urls if the add_additional_sources parameter is set allowing for a larger research scope) + +# from gpt_researcher.master.agent import GPTResearcher # Ensure this path is correct +# import asyncio + +# async def get_report(query: str, report_type: str, sources: list) -> str: +# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, add_additional_sources=True) +# await researcher.conduct_research() +# report = await researcher.write_report() +# return report, researcher + +# if __name__ == "__main__": +# query = "What are the latest advancements in AI?" +# report_type = "research_report" +# sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] # query is related + +# report, researcher = asyncio.run(get_report(query, report_type, sources)) +# print(report) + +# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, and additionally the add_additional_sources is set which should make gptr do default web search to gather more contexts! 
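+
+
+#### Optional: a compact pytest-style sketch of the same three scenarios. This is a hypothetical
+#### consolidation, assuming `pytest` and `pytest-asyncio` are installed and network access is
+#### available; it is kept commented out, like the cases above.
+
+# import pytest
+# from gpt_researcher.master.agent import GPTResearcher
+#
+# @pytest.mark.asyncio
+# @pytest.mark.parametrize("sources, add_more, expect_context", [
+#     # Related sources, no extra search: relevant context should be found (test case 1).
+#     (["https://en.wikipedia.org/wiki/Artificial_intelligence"], False, True),
+#     # Unrelated sources, no extra search: no relevant context (test case 2).
+#     (["https://en.wikipedia.org/wiki/Olympic_Games"], False, False),
+#     # Unrelated sources, additional web search allowed: context appears anyway (test case 3).
+#     (["https://en.wikipedia.org/wiki/Olympic_Games"], True, True),
+# ])
+# async def test_add_additional_sources(sources, add_more, expect_context):
+#     researcher = GPTResearcher(query="What is Microsoft's business model?",
+#                                report_type="research_report",
+#                                source_urls=sources,
+#                                add_additional_sources=add_more)
+#     await researcher.conduct_research()
+#     # get_research_context() returns the gathered context list; empty means nothing relevant.
+#     assert bool(researcher.get_research_context()) == expect_context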
From 7e07ec69601f9aeef7b936805a9b10a291e9e6c9 Mon Sep 17 00:00:00 2001
From: Makesh Srinivasan <66047630+Makesh-Srinivasan@users.noreply.github.com>
Date: Wed, 7 Aug 2024 02:31:16 -0400
Subject: [PATCH 03/10] update research on specific sources with source_urls and add_additional_sources

---
 .../gpt-researcher/context/tailored-research.md | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/docs/docs/gpt-researcher/context/tailored-research.md b/docs/docs/gpt-researcher/context/tailored-research.md
index d6972652d..bed251cd6 100644
--- a/docs/docs/gpt-researcher/context/tailored-research.md
+++ b/docs/docs/gpt-researcher/context/tailored-research.md
@@ -1,20 +1,23 @@
 # Tailored Research
-The GPT Researcher package allows you to tailor the research to your needs such as researching on specific sources or local documents, and even specify the agent prompt instruction upon which the research is conducted.
+The GPT Researcher package allows you to tailor the research to your needs such as researching on specific sources (URLs) or local documents, and even specifying the agent prompt instruction upon which the research is conducted.
 
 ### Research on Specific Sources πŸ“š
 
-You can specify the sources you want the GPT Researcher to research on by providing a list of URLs. GPT Researcher will then conduct research on the provided sources only.
-Simply pass the sources as the `source_urls` argument to the `GPTResearcher` class and the "static" `report_source`.
+You can specify the sources you want the GPT Researcher to research on by providing a list of URLs. The GPT Researcher will then conduct research on the provided sources via `source_urls`.
+
+If you want GPT Researcher to perform additional research outside of the URLs you provided, i.e., conduct research on various other websites that it finds suitable for the query/sub-query, you can set the parameter `add_additional_sources` to `True`. With the default value of `False`, GPT Researcher will only scour the websites you provide via `source_urls`.
+
 
 ```python
 from gpt_researcher import GPTResearcher
 import asyncio
 
-async def get_report(query: str, report_source: str, sources: list) -> str:
-    researcher = GPTResearcher(query=query, report_source=report_source, source_urls=sources)
-    research_context = await researcher.conduct_research()
-    return await researcher.write_report()
+async def get_report(query: str, report_type: str, sources: list) -> str:
+    researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, add_additional_sources=False)
+    await researcher.conduct_research()
+    report = await researcher.write_report()
+    return report
 
 if __name__ == "__main__":
     query = "What are the biggest trends in AI lately?"
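+
+    # A hedged variation (assuming a `sources` list as in this example, which the
+    # truncated hunk does not show): flipping the flag lets GPT Researcher also
+    # search the wider web beyond the given URLs.
+    # researcher = GPTResearcher(query=query, report_type="research_report",
+    #                            source_urls=sources, add_additional_sources=True)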
From 588879b8191d1e7af873be8df77328e7ff286073 Mon Sep 17 00:00:00 2001 From: ElishaKay Date: Mon, 11 Nov 2024 17:48:45 +0200 Subject: [PATCH 04/10] moved source_urls logic to ResearchConductor --- gpt_researcher/agent.py | 2 ++ gpt_researcher/skills/researcher.py | 16 ++++++++++++++-- tests/__init__.py | 0 tests/research_test.py | 4 ++-- 4 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 tests/__init__.py diff --git a/gpt_researcher/agent.py b/gpt_researcher/agent.py index 642afe62f..e8facf3ef 100644 --- a/gpt_researcher/agent.py +++ b/gpt_researcher/agent.py @@ -32,6 +32,7 @@ def __init__( report_source: str = ReportSource.Web.value, tone: Tone = Tone.Objective, source_urls=None, + add_additional_sources=False, documents=None, vector_store=None, vector_store_filter=None, @@ -57,6 +58,7 @@ def __init__( self.max_subtopics = max_subtopics self.tone = tone if isinstance(tone, Tone) else Tone.Objective self.source_urls = source_urls + self.add_additional_sources: bool = add_additional_sources self.research_sources = [] # The list of scraped sources including title, content and images self.research_images = [] # The list of selected research images self.documents = documents diff --git a/gpt_researcher/skills/researcher.py b/gpt_researcher/skills/researcher.py index 948f57329..aa7be121d 100644 --- a/gpt_researcher/skills/researcher.py +++ b/gpt_researcher/skills/researcher.py @@ -38,8 +38,20 @@ async def conduct_research(self): await stream_output("logs", "agent_generated", self.researcher.agent, self.researcher.websocket) # If specified, the researcher will use the given urls as the context for the research. - if self.researcher.source_urls: - self.researcher.context = await self.__get_context_by_urls(self.researcher.source_urls) + if self.source_urls: + self.context = await self.__get_context_by_urls(self.source_urls) + if len(self.context) == 0 and self.verbose: + # Could not find any relevant resources in source_urls to answer the query or sub-query. 
Will answer using model's inherent knowledge + await stream_output( + "logs", + "answering_from_memory", + f"🧐 I was unable to find relevant context in the provided sources...", + self.websocket, + ) + # If add_additional_sources parameter is set, more resources can be gathered to create additional context using default web search + if self.add_additional_sources: + additional_research = await self.__get_context_by_search(self.query) + self.context += ' '.join(additional_research) elif self.researcher.report_source == ReportSource.Local.value: document_data = await DocumentLoader(self.researcher.cfg.doc_path).load() diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/research_test.py b/tests/research_test.py index ec5fce21c..b668994eb 100644 --- a/tests/research_test.py +++ b/tests/research_test.py @@ -14,11 +14,11 @@ #### Test case 1 (original test case as control from https://docs.gptr.dev/docs/gpt-researcher/tailored-research) -from gpt_researcher.master.agent import GPTResearcher # Ensure this path is correct +from gpt_researcher.agent import GPTResearcher # Ensure this path is correct import asyncio async def get_report(query: str, report_type: str, sources: list) -> str: - researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources) + researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, add_additional_sources=False) await researcher.conduct_research() report = await researcher.write_report() return report, researcher From e093ffbbf7a73f32eadb88d87dcef5c19748827c Mon Sep 17 00:00:00 2001 From: ElishaKay Date: Mon, 11 Nov 2024 18:00:32 +0200 Subject: [PATCH 05/10] run sources test from root with: python -m tests.research_test --- gpt_researcher/skills/researcher.py | 8 ++++---- tests/research_test.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/gpt_researcher/skills/researcher.py b/gpt_researcher/skills/researcher.py index aa7be121d..79f579602 100644 --- a/gpt_researcher/skills/researcher.py +++ b/gpt_researcher/skills/researcher.py @@ -38,8 +38,8 @@ async def conduct_research(self): await stream_output("logs", "agent_generated", self.researcher.agent, self.researcher.websocket) # If specified, the researcher will use the given urls as the context for the research. - if self.source_urls: - self.context = await self.__get_context_by_urls(self.source_urls) + if self.researcher.source_urls: + self.context = await self.__get_context_by_urls(self.researcher.source_urls) if len(self.context) == 0 and self.verbose: # Could not find any relevant resources in source_urls to answer the query or sub-query. 
Will answer using model's inherent knowledge await stream_output( @@ -49,8 +49,8 @@ async def conduct_research(self): self.websocket, ) # If add_additional_sources parameter is set, more resources can be gathered to create additional context using default web search - if self.add_additional_sources: - additional_research = await self.__get_context_by_search(self.query) + if self.researcher.add_additional_sources: + additional_research = await self.__get_context_by_search(self.researcher.query) self.context += ' '.join(additional_research) elif self.researcher.report_source == ReportSource.Local.value: diff --git a/tests/research_test.py b/tests/research_test.py index b668994eb..595570273 100644 --- a/tests/research_test.py +++ b/tests/research_test.py @@ -25,7 +25,7 @@ async def get_report(query: str, report_type: str, sources: list) -> str: if __name__ == "__main__": query = "Research the latest advancements in AI and provide a detailed report in APA format including sources." - report_type = "research_report" + report_type = "sources" sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] # query is related report, researcher = asyncio.run(get_report(query, report_type, sources)) From 29d20ac7bc5abb47befacda1335f934c3189b2e6 Mon Sep 17 00:00:00 2001 From: ElishaKay Date: Mon, 11 Nov 2024 18:50:39 +0200 Subject: [PATCH 06/10] remove extra file --- gpt_researcher/master/agent.py | 577 --------------------------------- 1 file changed, 577 deletions(-) delete mode 100644 gpt_researcher/master/agent.py diff --git a/gpt_researcher/master/agent.py b/gpt_researcher/master/agent.py deleted file mode 100644 index a7f6e517f..000000000 --- a/gpt_researcher/master/agent.py +++ /dev/null @@ -1,577 +0,0 @@ -import asyncio -import time - -from typing import Set - -from gpt_researcher.config import Config -from gpt_researcher.context.compression import ContextCompressor, WrittenContentCompressor -from gpt_researcher.document import DocumentLoader, LangChainDocumentLoader -from gpt_researcher.master.actions import * -from gpt_researcher.memory import Memory -from gpt_researcher.utils.enum import ReportSource, ReportType, Tone - - -class GPTResearcher: - """ - GPT Researcher - """ - - def __init__( - self, - query: str, - report_type: str = ReportType.ResearchReport.value, - report_source=ReportSource.Web.value, - tone: Tone = Tone.Objective, - source_urls=None, - add_additional_sources=False, - documents=None, - config_path=None, - websocket=None, - agent=None, - role=None, - parent_query: str = "", - subtopics: list = [], - visited_urls: set = set(), - verbose: bool = True, - context=[], - headers: dict = None, # Add headers parameter - ): - """ - Initializes the GPTResearcher class with the specified parameters to set up the research environment. - - Args: - query (str): The main query for which research is conducted. - report_type (str): Type of report to generate. Defaults to a research report. - report_source (str): The source of data for research. Defaults to web sources. - tone (Tone): The tone of the report; objective by default. - source_urls (list or None): Initial list of URLs for research. - add_additional_sources (bool): Whether to add additional sources/links to the research. Set a non-empty valid value for source_urls for this parameter to take effect. - documents (list or None): Predefined list of documents to use. - config_path (str or None): Path to the configuration file. - websocket: Websocket connection for real-time updates. 
- agent (str or None): Designated agent for conducting research. - role (str or None): Role of the agent if specified. - parent_query (str): Main query that this query is derived from if any. - subtopics (list): List of subtopics related to the main query from the user. - visited_urls (set): Set of URLs that have already been visited. - verbose (bool): Toggle for verbose output for debugging or detailed logs. - context (list): Initial context for the research. - headers (dict or None): HTTP headers for web requests. - - Initializes internal state and prepares the gptr with necessary configuration. - """ - - self.headers = headers or {} - self.query: str = query - self.agent: str = agent - self.role: str = role - self.report_type: str = report_type - self.report_prompt: str = get_prompt_by_report_type( - self.report_type - ) # this validates the report type - self.report_source: str = report_source - self.research_costs: float = 0.0 - self.cfg = Config(config_path) - self.retriever = get_retriever(self.headers.get("retriever")) or get_retriever( - self.cfg.retriever - ) or get_default_retriever() - self.context = context - self.source_urls = source_urls - self.add_additional_sources: bool = add_additional_sources - self.documents = documents - self.memory = Memory(self.cfg.embedding_provider, self.headers) - self.visited_urls: set[str] = visited_urls - self.verbose: bool = verbose - self.websocket = websocket - self.headers = headers or {} - # Ensure tone is an instance of Tone enum - if isinstance(tone, dict): - print(f"Invalid tone format: {tone}. Setting to default Tone.Objective.") - self.tone = Tone.Objective - elif isinstance(tone, str): - self.tone = Tone[tone] - else: - self.tone = tone - - # Only relevant for DETAILED REPORTS - # -------------------------------------- - - # Stores the main query of the detailed report - self.parent_query = parent_query - - # Stores all the user provided subtopics - self.subtopics = subtopics - - async def conduct_research(self): - """ - Runs the GPT Researcher to conduct research on the specified source - """ - # Reset visited_urls and source_urls at the start of each research task - self.visited_urls.clear() - - if self.verbose: - await stream_output( - "logs", - "starting_research", - f"πŸ”Ž Starting the research task for '{self.query}'...", - self.websocket, - ) - - # Generate Agent - if not (self.agent and self.role): - self.agent, self.role = await choose_agent( - query=self.query, - cfg=self.cfg, - parent_query=self.parent_query, - cost_callback=self.add_costs, - headers=self.headers, - ) - - if self.verbose: - await stream_output("logs", "agent_generated", self.agent, self.websocket) - - # If specified, the researcher will use the given urls as the context for the research. - if self.source_urls: - self.context = await self.__get_context_by_urls(self.source_urls) - if len(self.context) == 0 and self.verbose: - # Could not find any relevant resources in source_urls to answer the query or sub-query. 
Will answer using model's inherent knowledge - await stream_output( - "logs", - "answering_from_memory", - f"🧐 I was unable to find relevant context in the provided sources...", - self.websocket, - ) - # If add_additional_sources parameter is set, more resources can be gathered to create additional context using default web search - if self.add_additional_sources: - additional_research = await self.__get_context_by_search(self.query) - self.context += ' '.join(additional_research) - - elif self.report_source == ReportSource.Local.value: - document_data = await DocumentLoader(self.cfg.doc_path).load() - self.context = await self.__get_context_by_search(self.query, document_data) - - elif self.report_source == ReportSource.LangChainDocuments.value: - langchain_documents_data = await LangChainDocumentLoader( - self.documents - ).load() - self.context = await self.__get_context_by_search( - self.query, langchain_documents_data - ) - - # Default web based research - else: - self.context = await self.__get_context_by_search(self.query) - - time.sleep(2) - if self.verbose: - await stream_output( - "logs", - "research_step_finalized", - f"Finalized research step.\nπŸ’Έ Total Research Costs: ${self.get_costs()}", - self.websocket, - ) - - return self.context - - async def write_report(self, existing_headers: list = [], relevant_written_contents: list = []): - """ - Writes the report based on research conducted - - Returns: - str: The report - """ - report = "" - - if self.verbose: - await stream_output( - "logs", - "task_summary_coming_up", - f"✍️ Writing summary for research task: {self.query} (this may take a few minutes)...", - self.websocket, - ) - - if self.report_type == "custom_report": - self.role = self.cfg.agent_role if self.cfg.agent_role else self.role - report = await generate_report( - query=self.query, - context=self.context, - agent_role_prompt=self.role, - report_type=self.report_type, - report_source=self.report_source, - tone=self.tone, - websocket=self.websocket, - cfg=self.cfg, - headers=self.headers, - ) - elif self.report_type == "subtopic_report": - report = await generate_report( - query=self.query, - context=self.context, - agent_role_prompt=self.role, - report_type=self.report_type, - report_source=self.report_source, - websocket=self.websocket, - tone=self.tone, - cfg=self.cfg, - main_topic=self.parent_query, - existing_headers=existing_headers, - relevant_written_contents=relevant_written_contents, - cost_callback=self.add_costs, - headers=self.headers, - ) - else: - report = await generate_report( - query=self.query, - context=self.context, - agent_role_prompt=self.role, - report_type=self.report_type, - report_source=self.report_source, - tone=self.tone, - websocket=self.websocket, - cfg=self.cfg, - cost_callback=self.add_costs, - headers=self.headers, - ) - - return report - - async def __get_context_by_urls(self, urls): - """ - Scrapes and compresses the context from the given urls - """ - new_search_urls = await self.__get_new_urls(urls) - if self.verbose: - await stream_output( - "logs", - "source_urls", - f"πŸ—‚οΈ I will conduct my research based on the following urls: {new_search_urls}...", - self.websocket, - ) - - scraped_sites = scrape_urls(new_search_urls, self.cfg) - return await self.__get_similar_content_by_query(self.query, scraped_sites) - - async def __get_context_by_search(self, query, scraped_data: list = []): - """ - Generates the context for the research task by searching the query and scraping the results - Returns: - context: List of context - 
""" - context = [] - # Generate Sub-Queries including original query - sub_queries = await get_sub_queries( - query=query, - agent_role_prompt=self.role, - cfg=self.cfg, - parent_query=self.parent_query, - report_type=self.report_type, - cost_callback=self.add_costs, - openai_api_key=self.headers.get("openai_api_key"), - ) - - # If this is not part of a sub researcher, add original query to research for better results - if self.report_type != "subtopic_report": - sub_queries.append(query) - - if self.verbose: - await stream_output( - "logs", - "subqueries", - f"πŸ—‚οΈ I will conduct my research based on the following queries: {sub_queries}...", - self.websocket, - True, - sub_queries, - ) - - # Using asyncio.gather to process the sub_queries asynchronously - context = await asyncio.gather( - *[ - self.__process_sub_query(sub_query, scraped_data) - for sub_query in sub_queries - ] - ) - return context - - async def __process_sub_query(self, sub_query: str, scraped_data: list = []): - """Takes in a sub query and scrapes urls based on it and gathers context. - - Args: - sub_query (str): The sub-query generated from the original query - scraped_data (list): Scraped data passed in - - Returns: - str: The context gathered from search - """ - if self.verbose: - await stream_output( - "logs", - "running_subquery_research", - f"\nπŸ” Running research for '{sub_query}'...", - self.websocket, - ) - - if not scraped_data: - scraped_data = await self.__scrape_data_by_query(sub_query) - - content = await self.__get_similar_content_by_query(sub_query, scraped_data) - - if content and self.verbose: - await stream_output( - "logs", "subquery_context_window", f"πŸ“ƒ {content}", self.websocket - ) - elif self.verbose: - await stream_output( - "logs", - "subquery_context_not_found", - f"🀷 No content found for '{sub_query}'...", - self.websocket, - ) - return content - - async def __get_new_urls(self, url_set_input): - """Gets the new urls from the given url set. 
- Args: url_set_input (set[str]): The url set to get the new urls from - Returns: list[str]: The new urls from the given url set - """ - - new_urls = [] - for url in url_set_input: - if url not in self.visited_urls: - self.visited_urls.add(url) - new_urls.append(url) - if self.verbose: - await stream_output( - "logs", - "added_source_url", - f"βœ… Added source url to research: {url}\n", - self.websocket, - True, - url, - ) - - return new_urls - - async def __scrape_data_by_query(self, sub_query): - """ - Runs a sub-query - Args: - sub_query: - - Returns: - Summary - """ - # Get Urls - retriever = self.retriever(sub_query) - search_results = await asyncio.to_thread( - retriever.search, max_results=self.cfg.max_search_results_per_query - ) - new_search_urls = await self.__get_new_urls( - [url.get("href") for url in search_results] - ) - - # Scrape Urls - if self.verbose: - await stream_output( - "logs", - "researching", - f"πŸ€” Researching for relevant information...\n", - self.websocket, - ) - - # Scrape Urls - scraped_content_results = await asyncio.to_thread( - scrape_urls, new_search_urls, self.cfg - ) - return scraped_content_results - - async def __get_similar_content_by_query(self, query, pages): - if self.verbose: - await stream_output( - "logs", - "fetching_query_content", - f"πŸ“š Getting relevant content based on query: {query}...", - self.websocket, - ) - - # Summarize Raw Data - context_compressor = ContextCompressor( - documents=pages, embeddings=self.memory.get_embeddings() - ) - # Run Tasks - return await context_compressor.async_get_context( - query=query, max_results=8, cost_callback=self.add_costs - ) - - ######################################################################################## - - # GETTERS & SETTERS - def get_source_urls(self) -> list: - return list(self.visited_urls) - - def get_research_context(self) -> list: - return self.context - - def get_costs(self) -> float: - return self.research_costs - - def set_verbose(self, verbose: bool): - self.verbose = verbose - - def add_costs(self, cost: int) -> None: - if not isinstance(cost, float) and not isinstance(cost, int): - raise ValueError("Cost must be an integer or float") - self.research_costs += cost - - ######################################################################################## - - # DETAILED REPORT - - async def write_introduction(self): - # Construct Report Introduction from main topic research - introduction = await get_report_introduction( - self.query, - self.context, - self.role, - self.cfg, - self.websocket, - self.add_costs, - ) - - return introduction - - async def get_subtopics(self): - """ - This async function generates subtopics based on user input and other parameters. - - Returns: - The `get_subtopics` function is returning the `subtopics` that are generated by the - `construct_subtopics` function. - """ - if self.verbose: - await stream_output( - "logs", - "generating_subtopics", - f"πŸ€” Generating subtopics...", - self.websocket, - ) - - subtopics = await construct_subtopics( - task=self.query, - data=self.context, - config=self.cfg, - # This is a list of user provided subtopics - subtopics=self.subtopics, - ) - - if self.verbose: - await stream_output( - "logs", "subtopics", f"πŸ“‹Subtopics: {subtopics}", self.websocket - ) - - return subtopics - - async def get_draft_section_titles(self): - """ - Writes the draft section titles based on research conducted. The draft section titles are used to retrieve the previous relevant written contents. 
- - Returns: - str: The headers markdown text - """ - if self.verbose: - await stream_output( - "logs", - "task_summary_coming_up", - f"✍️ Writing draft section titles for research task: {self.query}...", - self.websocket, - ) - - draft_section_titles = await generate_draft_section_titles( - query=self.query, - context=self.context, - agent_role_prompt=self.role, - report_type=self.report_type, - websocket=self.websocket, - cfg=self.cfg, - main_topic=self.parent_query, - cost_callback=self.add_costs, - headers=self.headers, - ) - - return draft_section_titles - - async def __get_similar_written_contents_by_query(self, - query: str, - written_contents: List[Dict], - similarity_threshold: float = 0.5, - max_results: int = 10 - ) -> List[str]: - """ - Asynchronously retrieves similar written contents based on a given query. - - Args: - query (str): The query to search for similar written contents. - written_contents (List[Dict]): List of written contents to search through. - similarity_threshold (float, optional): The minimum similarity score for content to be considered relevant. - Defaults to 0.5. - max_results (int, optional): The maximum number of similar contents to return. Defaults to 10. - - Returns: - List[str]: A list of similar written contents, limited by max_results. - """ - if self.verbose: - await stream_output( - "logs", - "fetching_relevant_written_content", - f"πŸ”Ž Getting relevant written content based on query: {query}...", - self.websocket, - ) - - # Retrieve similar written contents based on the query - # Use a higher similarity threshold to ensure more relevant results and reduce irrelevant matches - written_content_compressor = WrittenContentCompressor( - documents=written_contents, embeddings=self.memory.get_embeddings(), similarity_threshold=similarity_threshold - ) - return await written_content_compressor.async_get_context( - query=query, max_results=max_results, cost_callback=self.add_costs - ) - - async def get_similar_written_contents_by_draft_section_titles( - self, - current_subtopic: str, - draft_section_titles: List[str], - written_contents: List[Dict], - max_results: int = 10 - ) -> List[str]: - """ - Retrieve similar written contents based on current subtopic and draft section titles. - - Args: - current_subtopic (str): The current subtopic. - draft_section_titles (List[str]): List of draft section titles. - written_contents (List[Dict]): List of written contents to search through. - max_results (int): Maximum number of results to return. Defaults to 10. - - Returns: - List[str]: List of relevant written contents. 
- """ - all_queries = [current_subtopic] + draft_section_titles - - async def process_query(query: str) -> Set[str]: - return set(await self.__get_similar_written_contents_by_query(query, written_contents)) - - # Run all queries in parallel - results = await asyncio.gather(*[process_query(query) for query in all_queries]) - - # Combine all results - relevant_contents = set().union(*results) - - # Limit the number of results - relevant_contents = list(relevant_contents)[:max_results] - - if relevant_contents and self.verbose: - prettier_contents = "\n".join(relevant_contents) - await stream_output( - "logs", "relevant_contents_context", f"πŸ“ƒ {prettier_contents}", self.websocket - ) - - return relevant_contents \ No newline at end of file From 013c7fd55eed5a8b8257ac5876f57e896b3d0638 Mon Sep 17 00:00:00 2001 From: ElishaKay Date: Tue, 12 Nov 2024 07:49:09 +0200 Subject: [PATCH 07/10] limited sources when source_urls has any value --- gpt_researcher/skills/researcher.py | 6 +----- tests/report-types.py | 2 +- tests/research_test.py | 28 ++++++++++++++++++++++++---- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/gpt_researcher/skills/researcher.py b/gpt_researcher/skills/researcher.py index 79f579602..164d4c2f9 100644 --- a/gpt_researcher/skills/researcher.py +++ b/gpt_researcher/skills/researcher.py @@ -21,10 +21,6 @@ async def conduct_research(self): """ # Reset visited_urls and source_urls at the start of each research task self.researcher.visited_urls.clear() - # Due to deprecation of report_type in favor of report_source, - # we need to clear source_urls if report_source is not static - if self.researcher.report_source != "static" and self.researcher.report_type != "sources": - self.researcher.source_urls = [] if self.researcher.verbose: await stream_output( @@ -40,7 +36,7 @@ async def conduct_research(self): # If specified, the researcher will use the given urls as the context for the research. if self.researcher.source_urls: self.context = await self.__get_context_by_urls(self.researcher.source_urls) - if len(self.context) == 0 and self.verbose: + if self.context and len(self.context) == 0 and self.verbose: # Could not find any relevant resources in source_urls to answer the query or sub-query. 
Will answer using model's inherent knowledge await stream_output( "logs", diff --git a/tests/report-types.py b/tests/report-types.py index 2a42aa3ff..44aff1006 100644 --- a/tests/report-types.py +++ b/tests/report-types.py @@ -1,7 +1,7 @@ import os import asyncio import pytest -from gpt_researcher import GPTResearcher +from gpt_researcher.agent import GPTResearcher # Define the report types to test report_types = [ diff --git a/tests/research_test.py b/tests/research_test.py index 595570273..8f95e3130 100644 --- a/tests/research_test.py +++ b/tests/research_test.py @@ -16,17 +16,37 @@ from gpt_researcher.agent import GPTResearcher # Ensure this path is correct import asyncio +import logging +from typing import List, Dict, Any + +class CustomLogsHandler: + """A custom Logs handler class to handle JSON data.""" + def __init__(self): + self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data + logging.basicConfig(level=logging.INFO) # Set up logging configuration + + async def send_json(self, data: Dict[str, Any]) -> None: + """Send JSON data and log it, with error handling.""" + try: + self.logs.append(data) # Append data to logs + logging.info(f"My custom Log: {data}") # Use logging instead of print + except Exception as e: + logging.error(f"Error logging data: {e}") # Log any errors async def get_report(query: str, report_type: str, sources: list) -> str: - researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, add_additional_sources=False) + custom_logs_handler = CustomLogsHandler() + researcher = GPTResearcher(query=query, + report_type=report_type, + add_additional_sources=False, + websocket=custom_logs_handler) await researcher.conduct_research() report = await researcher.write_report() return report, researcher if __name__ == "__main__": - query = "Research the latest advancements in AI and provide a detailed report in APA format including sources." - report_type = "sources" - sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"] # query is related + query = "Write an analysis on paul graham" + report_type = "research_report" + sources = ["https://www.paulgraham.com/when.html", "https://www.paulgraham.com/noob.html"] # query is related report, researcher = asyncio.run(get_report(query, report_type, sources)) print(report) From 0d6ca2c9c62e96ef0c1205e4b187aee57821519e Mon Sep 17 00:00:00 2001 From: ElishaKay Date: Tue, 12 Nov 2024 07:56:08 +0200 Subject: [PATCH 08/10] fix imports in tests --- tests/research_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/research_test.py b/tests/research_test.py index 8f95e3130..294933932 100644 --- a/tests/research_test.py +++ b/tests/research_test.py @@ -57,7 +57,7 @@ async def get_report(query: str, report_type: str, sources: list) -> str: #### Test case 2 (Illustrating the problem, i.e., source_urls are not scoured. 
Hence, no relevant context)
 
-# from gpt_researcher.master.agent import GPTResearcher # Ensure this path is correct
+# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct
 # import asyncio
 
 # async def get_report(query: str, report_type: str, sources: list) -> str:
@@ -80,7 +80,7 @@ async def get_report(query: str, report_type: str, sources: list) -> str:
 
 #### Test case 3 (Suggested solution - add_additional_sources parameter allows GPTR to scour more of the web and not restrict to source_urls)
 
-# from gpt_researcher.master.agent import GPTResearcher # Ensure this path is correct
+# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct
 # import asyncio
 
 # async def get_report(query: str, report_type: str, sources: list) -> str:
@@ -103,7 +103,7 @@
 
 # #### Test case 4 (Furthermore, GPTR will create more context in addition to source_urls if the add_additional_sources parameter is set allowing for a larger research scope)
 
-# from gpt_researcher.master.agent import GPTResearcher # Ensure this path is correct
+# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct
 # import asyncio
 
 # async def get_report(query: str, report_type: str, sources: list) -> str:

From 7119920db26eec0beaa3b3776a43e38d3a6f1666 Mon Sep 17 00:00:00 2001
From: ElishaKay
Date: Tue, 12 Nov 2024 07:59:34 +0200
Subject: [PATCH 09/10] renamed add_additional_sources to complement_source_urls

---
 .../context/tailored-research.md    |  4 ++--
 gpt_researcher/agent.py             |  4 ++--
 gpt_researcher/skills/researcher.py |  4 ++--
 tests/research_test.py              | 18 +++++++++---------
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/docs/docs/gpt-researcher/context/tailored-research.md b/docs/docs/gpt-researcher/context/tailored-research.md
index bed251cd6..05fa965fe 100644
--- a/docs/docs/gpt-researcher/context/tailored-research.md
+++ b/docs/docs/gpt-researcher/context/tailored-research.md
@@ -6,7 +6,7 @@ The GPT Researcher package allows you to tailor the research to your needs such
 
 You can specify the sources you want the GPT Researcher to research on by providing a list of URLs. The GPT Researcher will then conduct research on the provided sources via `source_urls`.
 
-If you want GPT Researcher to perform additional research outside of the URLs you provided, i.e., conduct research on various other websites that it finds suitable for the query/sub-query, you can set the parameter `add_additional_sources` to `True`. With the default value of `False`, GPT Researcher will only scour the websites you provide via `source_urls`.
+If you want GPT Researcher to perform additional research outside of the URLs you provided, i.e., conduct research on various other websites that it finds suitable for the query/sub-query, you can set the parameter `complement_source_urls` to `True`. With the default value of `False`, GPT Researcher will only scour the websites you provide via `source_urls`.
 ```python
@@ -14,7 +14,7 @@ from gpt_researcher import GPTResearcher
 import asyncio
 
 async def get_report(query: str, report_type: str, sources: list) -> str:
-    researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, add_additional_sources=False)
+    researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=False)
     await researcher.conduct_research()
     report = await researcher.write_report()
     return report
diff --git a/gpt_researcher/agent.py b/gpt_researcher/agent.py
index e8facf3ef..66b1bb97e 100644
--- a/gpt_researcher/agent.py
+++ b/gpt_researcher/agent.py
@@ -32,7 +32,7 @@ def __init__(
         report_source: str = ReportSource.Web.value,
         tone: Tone = Tone.Objective,
         source_urls=None,
-        add_additional_sources=False,
+        complement_source_urls=False,
         documents=None,
         vector_store=None,
         vector_store_filter=None,
@@ -58,7 +58,7 @@ def __init__(
         self.max_subtopics = max_subtopics
         self.tone = tone if isinstance(tone, Tone) else Tone.Objective
         self.source_urls = source_urls
-        self.add_additional_sources: bool = add_additional_sources
+        self.complement_source_urls: bool = complement_source_urls
         self.research_sources = []  # The list of scraped sources including title, content and images
         self.research_images = []  # The list of selected research images
         self.documents = documents
diff --git a/gpt_researcher/skills/researcher.py b/gpt_researcher/skills/researcher.py
index 164d4c2f9..dc28c6736 100644
--- a/gpt_researcher/skills/researcher.py
+++ b/gpt_researcher/skills/researcher.py
@@ -44,8 +44,8 @@ async def conduct_research(self):
                     f"🧐 I was unable to find relevant context in the provided sources...",
                     self.websocket,
                 )
-            # If add_additional_sources parameter is set, more resources can be gathered to create additional context using default web search
-            if self.researcher.add_additional_sources:
+            # If the complement_source_urls parameter is set, more resources can be gathered to create additional context using the default web search
+            if self.researcher.complement_source_urls:
                 additional_research = await self.__get_context_by_search(self.researcher.query)
                 self.context += ' '.join(additional_research)
diff --git a/tests/research_test.py b/tests/research_test.py
index 294933932..b58d5b92a 100644
--- a/tests/research_test.py
+++ b/tests/research_test.py
@@ -1,8 +1,8 @@
 """
-Hi! The following test cases are for the new parameter `add_additional_sources` and fix on the functional error with `source_urls` in GPTResearcher class.
+Hi! The following test cases are for the new parameter `complement_source_urls` and the fix for the functional error with `source_urls` in the GPTResearcher class.
 The source_urls parameter was resetting each time in conduct_research function causing gptr to forget the given links. Now, that has been fixed and a new parameter is introduced.
-This parameter named will `add_additional_sources` allow GPTR to research on sources other than the provided sources via source_urls if set to True.
+This parameter, named `complement_source_urls`, will allow GPTR to research sources other than those provided via source_urls if set to True.
 Default is False, i.e., no additional research will be conducted on newer sources.
""" @@ -37,7 +37,7 @@ async def get_report(query: str, report_type: str, sources: list) -> str: custom_logs_handler = CustomLogsHandler() researcher = GPTResearcher(query=query, report_type=report_type, - add_additional_sources=False, + complement_source_urls=False, websocket=custom_logs_handler) await researcher.conduct_research() report = await researcher.write_report() @@ -78,13 +78,13 @@ async def get_report(query: str, report_type: str, sources: list) -> str: -#### Test case 3 (Suggested solution - add_additional_sources parameter allows GPTR to scour more of the web and not restrict to source_urls) +#### Test case 3 (Suggested solution - complement_source_urls parameter allows GPTR to scour more of the web and not restrict to source_urls) # from gpt_researcher.agent import GPTResearcher # Ensure this path is correct # import asyncio # async def get_report(query: str, report_type: str, sources: list) -> str: -# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, add_additional_sources=True) +# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True) # await researcher.conduct_research() # report = await researcher.write_report() # return report, researcher @@ -97,17 +97,17 @@ async def get_report(query: str, report_type: str, sources: list) -> str: # report, researcher = asyncio.run(get_report(query, report_type, sources)) # print(report) -# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is UNRELATED to the contents of the page, but the add_additional_sources is set which should make gptr do default web search to gather contexts +# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is UNRELATED to the contents of the page, but the complement_source_urls is set which should make gptr do default web search to gather contexts -# #### Test case 4 (Furthermore, GPTR will create more context in addition to source_urls if the add_additional_sources parameter is set allowing for a larger research scope) +# #### Test case 4 (Furthermore, GPTR will create more context in addition to source_urls if the complement_source_urls parameter is set allowing for a larger research scope) # from gpt_researcher.agent import GPTResearcher # Ensure this path is correct # import asyncio # async def get_report(query: str, report_type: str, sources: list) -> str: -# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, add_additional_sources=True) +# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True) # await researcher.conduct_research() # report = await researcher.write_report() # return report, researcher @@ -120,4 +120,4 @@ async def get_report(query: str, report_type: str, sources: list) -> str: # report, researcher = asyncio.run(get_report(query, report_type, sources)) # print(report) -# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, and additionally the add_additional_sources is set which should make gptr do default web search to gather more contexts! 
+# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, and additionally the complement_source_urls is set which should make gptr do default web search to gather more contexts! From c083e65f0fc78ef9d299cfd8731589975248f321 Mon Sep 17 00:00:00 2001 From: ElishaKay Date: Tue, 12 Nov 2024 14:13:55 +0200 Subject: [PATCH 10/10] update import statement of tests --- tests/report-types.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/tests/report-types.py b/tests/report-types.py index 44aff1006..073f8336e 100644 --- a/tests/report-types.py +++ b/tests/report-types.py @@ -2,6 +2,22 @@ import asyncio import pytest from gpt_researcher.agent import GPTResearcher +import logging +from typing import List, Dict, Any + +class CustomLogsHandler: + """A custom Logs handler class to handle JSON data.""" + def __init__(self): + self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data + logging.basicConfig(level=logging.INFO) # Set up logging configuration + + async def send_json(self, data: Dict[str, Any]) -> None: + """Send JSON data and log it, with error handling.""" + try: + self.logs.append(data) # Append data to logs + logging.info(f"My custom Log: {data}") # Use logging instead of print + except Exception as e: + logging.error(f"Error logging data: {e}") # Log any errors # Define the report types to test report_types = [ @@ -22,9 +38,10 @@ async def test_gpt_researcher(report_type): # Ensure the output directory exists if not os.path.exists(output_dir): os.makedirs(output_dir) - + + custom_logs_handler = CustomLogsHandler() # Create an instance of GPTResearcher - researcher = GPTResearcher(query=query, report_type=report_type) + researcher = GPTResearcher(query=query, report_type=report_type, websocket=custom_logs_handler) # Conduct research and write the report await researcher.conduct_research()