diff --git a/docs/docs/gpt-researcher/context/tailored-research.md b/docs/docs/gpt-researcher/context/tailored-research.md
index d6972652d..05fa965fe 100644
--- a/docs/docs/gpt-researcher/context/tailored-research.md
+++ b/docs/docs/gpt-researcher/context/tailored-research.md
@@ -1,20 +1,23 @@
 # Tailored Research
 
-The GPT Researcher package allows you to tailor the research to your needs such as researching on specific sources or local documents, and even specify the agent prompt instruction upon which the research is conducted.
+The GPT Researcher package allows you to tailor the research to your needs, such as researching specific sources (URLs) or local documents, and even specifying the agent prompt instructions that guide the research.
 
 ### Research on Specific Sources 📚
 
-You can specify the sources you want the GPT Researcher to research on by providing a list of URLs. GPT Researcher will then conduct research on the provided sources only.
-Simply pass the sources as the `source_urls` argument to the `GPTResearcher` class and the "static" `report_source`.
+You can specify the sources you want GPT Researcher to research by providing a list of URLs. GPT Researcher will then conduct research on the sources provided via `source_urls`.
+
+If you want GPT Researcher to perform additional research outside of the URLs you provided, i.e., to also research other websites that it finds suitable for the query/sub-query, set the parameter `complement_source_urls` to `True`. With the default value of `False`, only the websites you provide via `source_urls` will be scoured.
+
 ```python
 from gpt_researcher import GPTResearcher
 import asyncio
 
-async def get_report(query: str, report_source: str, sources: list) -> str:
-    researcher = GPTResearcher(query=query, report_source=report_source, source_urls=sources)
-    research_context = await researcher.conduct_research()
-    return await researcher.write_report()
+async def get_report(query: str, report_type: str, sources: list) -> str:
+    researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=False)
+    await researcher.conduct_research()
+    report = await researcher.write_report()
+    return report
 
 if __name__ == "__main__":
     query = "What are the biggest trends in AI lately?"
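For the complementary mode described in the new docs paragraph above, here is a minimal sketch (same `GPTResearcher` API as the docs example, with `complement_source_urls` flipped to `True`; the query and URLs are illustrative only):

```python
from gpt_researcher import GPTResearcher
import asyncio

async def get_report(query: str, report_type: str, sources: list) -> str:
    # complement_source_urls=True lets GPT Researcher add its default web search
    # results on top of the URLs supplied via source_urls.
    researcher = GPTResearcher(query=query, report_type=report_type,
                               source_urls=sources, complement_source_urls=True)
    await researcher.conduct_research()
    return await researcher.write_report()

if __name__ == "__main__":
    query = "What are the biggest trends in AI lately?"
    sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence"]  # illustrative
    print(asyncio.run(get_report(query, "research_report", sources)))
```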
diff --git a/gpt_researcher/agent.py b/gpt_researcher/agent.py
index 642afe62f..66b1bb97e 100644
--- a/gpt_researcher/agent.py
+++ b/gpt_researcher/agent.py
@@ -32,6 +32,7 @@ def __init__(
         report_source: str = ReportSource.Web.value,
         tone: Tone = Tone.Objective,
         source_urls=None,
+        complement_source_urls=False,
         documents=None,
         vector_store=None,
         vector_store_filter=None,
@@ -57,6 +58,7 @@
         self.max_subtopics = max_subtopics
         self.tone = tone if isinstance(tone, Tone) else Tone.Objective
         self.source_urls = source_urls
+        self.complement_source_urls: bool = complement_source_urls
         self.research_sources = []  # The list of scraped sources, including title, content and images
         self.research_images = []  # The list of selected research images
         self.documents = documents
diff --git a/gpt_researcher/skills/researcher.py b/gpt_researcher/skills/researcher.py
index 948f57329..dc28c6736 100644
--- a/gpt_researcher/skills/researcher.py
+++ b/gpt_researcher/skills/researcher.py
@@ -21,10 +21,6 @@ async def conduct_research(self):
         """
         # Reset visited_urls and source_urls at the start of each research task
         self.researcher.visited_urls.clear()
-        # Due to deprecation of report_type in favor of report_source,
-        # we need to clear source_urls if report_source is not static
-        if self.researcher.report_source != "static" and self.researcher.report_type != "sources":
-            self.researcher.source_urls = []
 
         if self.researcher.verbose:
             await stream_output(
@@ -39,7 +35,19 @@
 
         # If specified, the researcher will use the given urls as the context for the research.
         if self.researcher.source_urls:
             self.researcher.context = await self.__get_context_by_urls(self.researcher.source_urls)
+            if not self.researcher.context and self.researcher.verbose:
+                # Could not find any relevant resources in source_urls to answer the query or sub-query;
+                # will answer using the model's inherent knowledge.
+                await stream_output(
+                    "logs",
+                    "answering_from_memory",
+                    "🧐 I was unable to find relevant context in the provided sources...",
+                    self.researcher.websocket,
+                )
+            # If the complement_source_urls parameter is set, more resources can be gathered to create additional context using the default web search
+            if self.researcher.complement_source_urls:
+                additional_research = await self.__get_context_by_search(self.researcher.query)
+                self.researcher.context += ' '.join(additional_research)
         elif self.researcher.report_source == ReportSource.Local.value:
             document_data = await DocumentLoader(self.researcher.cfg.doc_path).load()
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/report-types.py b/tests/report-types.py
index 2a42aa3ff..073f8336e 100644
--- a/tests/report-types.py
+++ b/tests/report-types.py
@@ -1,7 +1,23 @@
 import os
 import asyncio
 import pytest
-from gpt_researcher import GPTResearcher
+from gpt_researcher.agent import GPTResearcher
+import logging
+from typing import List, Dict, Any
+
+class CustomLogsHandler:
+    """A custom logs handler class to handle JSON data."""
+    def __init__(self):
+        self.logs: List[Dict[str, Any]] = []  # Initialize logs to store data
+        logging.basicConfig(level=logging.INFO)  # Set up logging configuration
+
+    async def send_json(self, data: Dict[str, Any]) -> None:
+        """Send JSON data and log it, with error handling."""
+        try:
+            self.logs.append(data)  # Append data to logs
+            logging.info(f"My custom log: {data}")  # Use logging instead of print
+        except Exception as e:
+            logging.error(f"Error logging data: {e}")  # Log any errors
 
 # Define the report types to test
 report_types = [
@@ -22,9 +38,10 @@
     # Ensure the output directory exists
     if not os.path.exists(output_dir):
         os.makedirs(output_dir)
-    
+
+    custom_logs_handler = CustomLogsHandler()
     # Create an instance of GPTResearcher
-    researcher = GPTResearcher(query=query, report_type=report_type)
+    researcher = GPTResearcher(query=query, report_type=report_type, websocket=custom_logs_handler)
 
     # Conduct research and write the report
     await researcher.conduct_research()
diff --git a/tests/research_test.py b/tests/research_test.py
new file mode 100644
index 000000000..b58d5b92a
--- /dev/null
+++ b/tests/research_test.py
@@ -0,0 +1,123 @@
+"""
+Hi! The following test cases are for the new `complement_source_urls` parameter and the fix for the functional error with `source_urls` in the GPTResearcher class.
+
+The source_urls parameter was being reset each time in the conduct_research function, causing GPTR to forget the given links. That has now been fixed, and a new parameter has been introduced.
+This parameter, named `complement_source_urls`, allows GPTR to research sources other than those provided via source_urls when set to True.
+The default is False, i.e., no additional research will be conducted on new sources.
+"""
+
+## Notes:
+## Please uncomment the test case you want to run and comment out the rest.
+## Thanks!
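+
+## For orientation, the behavior under test: after the fix, conduct_research keeps the
+## given source_urls, and (sketched from the gpt_researcher/skills/researcher.py hunk
+## above, paraphrased rather than verbatim) roughly does:
+##
+##   if self.researcher.source_urls:
+##       self.researcher.context = await self.__get_context_by_urls(self.researcher.source_urls)
+##       if not self.researcher.context and self.researcher.verbose:
+##           ...  # log that no relevant context was found in the provided sources
+##       if self.researcher.complement_source_urls:
+##           additional_research = await self.__get_context_by_search(self.researcher.query)
+##           self.researcher.context += ' '.join(additional_research)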
+ + + +#### Test case 1 (original test case as control from https://docs.gptr.dev/docs/gpt-researcher/tailored-research) + +from gpt_researcher.agent import GPTResearcher # Ensure this path is correct +import asyncio +import logging +from typing import List, Dict, Any + +class CustomLogsHandler: + """A custom Logs handler class to handle JSON data.""" + def __init__(self): + self.logs: List[Dict[str, Any]] = [] # Initialize logs to store data + logging.basicConfig(level=logging.INFO) # Set up logging configuration + + async def send_json(self, data: Dict[str, Any]) -> None: + """Send JSON data and log it, with error handling.""" + try: + self.logs.append(data) # Append data to logs + logging.info(f"My custom Log: {data}") # Use logging instead of print + except Exception as e: + logging.error(f"Error logging data: {e}") # Log any errors + +async def get_report(query: str, report_type: str, sources: list) -> str: + custom_logs_handler = CustomLogsHandler() + researcher = GPTResearcher(query=query, + report_type=report_type, + complement_source_urls=False, + websocket=custom_logs_handler) + await researcher.conduct_research() + report = await researcher.write_report() + return report, researcher + +if __name__ == "__main__": + query = "Write an analysis on paul graham" + report_type = "research_report" + sources = ["https://www.paulgraham.com/when.html", "https://www.paulgraham.com/noob.html"] # query is related + + report, researcher = asyncio.run(get_report(query, report_type, sources)) + print(report) + + print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say Non-zero value because the query is related to the contents of the page, so there will be relevant context present + + + +#### Test case 2 (Illustrating the problem, i.e., source_urls are not scoured. Hence, no relevant context) + +# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct +# import asyncio + +# async def get_report(query: str, report_type: str, sources: list) -> str: +# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources) +# await researcher.conduct_research() +# report = await researcher.write_report() +# return report, researcher + +# if __name__ == "__main__": +# query = "What is Microsoft's business model?" +# report_type = "research_report" +# sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"] # query is UNRELATED. + +# report, researcher = asyncio.run(get_report(query, report_type, sources)) +# print(report) + +# print(f"\nLength of the context = {len(researcher.get_research_context())}") # Must say 0 (zero) value because the query is UNRELATED to the contents of the pages, so there will be NO relevant context present + + + +#### Test case 3 (Suggested solution - complement_source_urls parameter allows GPTR to scour more of the web and not restrict to source_urls) + +# from gpt_researcher.agent import GPTResearcher # Ensure this path is correct +# import asyncio + +# async def get_report(query: str, report_type: str, sources: list) -> str: +# researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True) +# await researcher.conduct_research() +# report = await researcher.write_report() +# return report, researcher + +# if __name__ == "__main__": +# query = "What is Microsoft's business model?" 
+#     report_type = "research_report"
+#     sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"]  # query is UNRELATED
+
+#     report, researcher = asyncio.run(get_report(query, report_type, sources))
+#     print(report)
+
+#     print(f"\nLength of the context = {len(researcher.get_research_context())}")  # Must be a non-zero value: the query is UNRELATED to the contents of the pages, but complement_source_urls is set, which should make GPTR do a default web search to gather context
+
+
+
+#### Test case 4 (Furthermore, GPTR will create more context in addition to source_urls if the complement_source_urls parameter is set, allowing for a larger research scope)
+
+# from gpt_researcher.agent import GPTResearcher  # Ensure this path is correct
+# import asyncio
+
+# async def get_report(query: str, report_type: str, sources: list) -> tuple:
+#     researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True)
+#     await researcher.conduct_research()
+#     report = await researcher.write_report()
+#     return report, researcher
+
+# if __name__ == "__main__":
+#     query = "What are the latest advancements in AI?"
+#     report_type = "research_report"
+#     sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"]  # query is related
+
+#     report, researcher = asyncio.run(get_report(query, report_type, sources))
+#     print(report)
+
+#     print(f"\nLength of the context = {len(researcher.get_research_context())}")  # Must be a non-zero value: the query is related to the contents of the pages, and additionally complement_source_urls is set, which should make GPTR do a default web search to gather even more context!
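If commenting test cases in and out proves tedious, the four scenarios above could also be expressed as a single parametrized suite. A sketch, assuming `pytest` with the `pytest-asyncio` plugin is available; the expected outcomes mirror the inline comments in the test cases:

```python
import pytest
from gpt_researcher.agent import GPTResearcher

SCENARIOS = [
    # (query, sources, complement_source_urls, expect_nonempty_context)
    ("Write an analysis on paul graham",
     ["https://www.paulgraham.com/when.html", "https://www.paulgraham.com/noob.html"],
     False, True),   # related sources -> relevant context found
    ("What is Microsoft's business model?",
     ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"],
     False, False),  # unrelated sources, no complement -> no context
    ("What is Microsoft's business model?",
     ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"],
     True, True),    # unrelated sources + complement -> web search fills context
    ("What are the latest advancements in AI?",
     ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"],
     True, True),    # related sources + complement -> even more context
]

@pytest.mark.asyncio
@pytest.mark.parametrize("query,sources,complement,expect_context", SCENARIOS)
async def test_complement_source_urls(query, sources, complement, expect_context):
    researcher = GPTResearcher(query=query, report_type="research_report",
                               source_urls=sources,
                               complement_source_urls=complement)
    await researcher.conduct_research()
    # Non-empty context iff the scenario predicts relevant material was gathered
    assert bool(researcher.get_research_context()) == expect_context
```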