Merge pull request #982 from ElishaKay/source_url_with_additional_research

fix: passing source_urls limits sources

Showing 6 changed files with 168 additions and 15 deletions.

""" | ||
Hi! The following test cases are for the new parameter `complement_source_urls` and fix on the functional error with `source_urls` in GPTResearcher class. | ||
The source_urls parameter was resetting each time in conduct_research function causing gptr to forget the given links. Now, that has been fixed and a new parameter is introduced. | ||
This parameter named will `complement_source_urls` allow GPTR to research on sources other than the provided sources via source_urls if set to True. | ||
Default is False, i.e., no additional research will be conducted on newer sources. | ||
""" | ||

## Notes:
## Please uncomment the test case you want to run and comment out the rest.
## Thanks!

#### Test case 1 (original test case, as a control, from https://docs.gptr.dev/docs/gpt-researcher/tailored-research)

from gpt_researcher.agent import GPTResearcher  # Ensure this path is correct
import asyncio
import logging
from typing import List, Dict, Any


class CustomLogsHandler:
    """A custom Logs handler class to handle JSON data."""
    def __init__(self):
        self.logs: List[Dict[str, Any]] = []  # Initialize logs to store data
        logging.basicConfig(level=logging.INFO)  # Set up logging configuration

    async def send_json(self, data: Dict[str, Any]) -> None:
        """Send JSON data and log it, with error handling."""
        try:
            self.logs.append(data)  # Append data to logs
            logging.info(f"My custom Log: {data}")  # Use logging instead of print
        except Exception as e:
            logging.error(f"Error logging data: {e}")  # Log any errors

async def get_report(query: str, report_type: str, sources: list) -> tuple:
    custom_logs_handler = CustomLogsHandler()
    researcher = GPTResearcher(query=query,
                               report_type=report_type,
                               source_urls=sources,
                               complement_source_urls=False,
                               websocket=custom_logs_handler)
    await researcher.conduct_research()
    report = await researcher.write_report()
    return report, researcher


if __name__ == "__main__":
    query = "Write an analysis on paul graham"
    report_type = "research_report"
    sources = ["https://www.paulgraham.com/when.html", "https://www.paulgraham.com/noob.html"]  # query is related

    report, researcher = asyncio.run(get_report(query, report_type, sources))
    print(report)

    # Must be non-zero because the query is related to the contents of the pages,
    # so relevant context will be present.
    print(f"\nLength of the context = {len(researcher.get_research_context())}")


#### Test case 2 (illustrating the problem: source_urls were not scoured, hence no relevant context)

# from gpt_researcher.agent import GPTResearcher  # Ensure this path is correct
# import asyncio

# async def get_report(query: str, report_type: str, sources: list) -> tuple:
#     researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources)
#     await researcher.conduct_research()
#     report = await researcher.write_report()
#     return report, researcher

# if __name__ == "__main__":
#     query = "What is Microsoft's business model?"
#     report_type = "research_report"
#     sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"]  # query is UNRELATED

#     report, researcher = asyncio.run(get_report(query, report_type, sources))
#     print(report)

#     # Must be 0 (zero) because the query is UNRELATED to the contents of the pages,
#     # so there will be NO relevant context present.
#     print(f"\nLength of the context = {len(researcher.get_research_context())}")


#### Test case 3 (suggested solution: the complement_source_urls parameter lets GPTR scour more of the web rather than restricting itself to source_urls)

# from gpt_researcher.agent import GPTResearcher  # Ensure this path is correct
# import asyncio

# async def get_report(query: str, report_type: str, sources: list) -> tuple:
#     researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True)
#     await researcher.conduct_research()
#     report = await researcher.write_report()
#     return report, researcher

# if __name__ == "__main__":
#     query = "What is Microsoft's business model?"
#     report_type = "research_report"
#     sources = ["https://www.apple.com", "https://en.wikipedia.org/wiki/Olympic_Games"]  # query is UNRELATED

#     report, researcher = asyncio.run(get_report(query, report_type, sources))
#     print(report)

#     # Must be non-zero: the query is UNRELATED to the contents of the pages, but
#     # complement_source_urls is set, which makes GPTR run its default web search
#     # to gather context.
#     print(f"\nLength of the context = {len(researcher.get_research_context())}")


#### Test case 4 (furthermore, GPTR gathers context beyond source_urls when complement_source_urls is set, allowing a larger research scope)

# from gpt_researcher.agent import GPTResearcher  # Ensure this path is correct
# import asyncio

# async def get_report(query: str, report_type: str, sources: list) -> tuple:
#     researcher = GPTResearcher(query=query, report_type=report_type, source_urls=sources, complement_source_urls=True)
#     await researcher.conduct_research()
#     report = await researcher.write_report()
#     return report, researcher

# if __name__ == "__main__":
#     query = "What are the latest advancements in AI?"
#     report_type = "research_report"
#     sources = ["https://en.wikipedia.org/wiki/Artificial_intelligence", "https://www.ibm.com/watson/ai"]  # query is related

#     report, researcher = asyncio.run(get_report(query, report_type, sources))
#     print(report)

#     # Must be non-zero: the query is related to the contents of the pages, and
#     # complement_source_urls is additionally set, making GPTR run its default
#     # web search to gather even more context!
#     print(f"\nLength of the context = {len(researcher.get_research_context())}")