From be8c6dd45dda97ac2e2451de39d77c29ab210dd1 Mon Sep 17 00:00:00 2001 From: assafelovic Date: Sun, 16 Jun 2024 11:26:11 +0300 Subject: [PATCH] added implementation for custom retriever --- docs/docs/gpt-researcher/retrievers.md | 56 ++++++++++++++++++++++ docs/sidebars.js | 1 + gpt_researcher/master/actions.py | 3 ++ gpt_researcher/retrievers/__init__.py | 2 + gpt_researcher/retrievers/custom/custom.py | 52 ++++++++++++++++++++ 5 files changed, 114 insertions(+) create mode 100644 docs/docs/gpt-researcher/retrievers.md create mode 100644 gpt_researcher/retrievers/custom/custom.py diff --git a/docs/docs/gpt-researcher/retrievers.md b/docs/docs/gpt-researcher/retrievers.md new file mode 100644 index 000000000..9ae648ee0 --- /dev/null +++ b/docs/docs/gpt-researcher/retrievers.md @@ -0,0 +1,56 @@ +# Retrievers + +Retrievers are search engines used to find the most relevant documents for a given research task. +You can specify your preferred web search or use any custom retriever of your choice. + +## Web Search Engines +GPT Researcher defaults to using the [Tavily](https://app.tavily.com) search engine for retrieving search results. +But you can also use other search engines by specifying the `RETRIEVER` env var. Please note that each search engine has its own API Key requirements and usage limits. 
+ +For example: +```bash +RETRIEVER=bing +``` + +Thanks to our community, we have integrated the following web search engines: +- [Tavily](https://app.tavily.com) - Default +- [Bing](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) - Env: `RETRIEVER=bing` +- [Google](https://developers.google.com/custom-search/v1/overview) - Env: `RETRIEVER=google` +- [Serp API](https://serpapi.com/) - Env: `RETRIEVER=serpapi` +- [Serper](https://serper.dev/) - Env: `RETRIEVER=serper` +- [Searx](https://searx.github.io/searx/) - Env: `RETRIEVER=searx` +- [DuckDuckGo](https://pypi.org/project/duckduckgo-search/) - Env: `RETRIEVER=duckduckgo` + +## Custom Retrievers +You can also use any custom retriever of your choice by specifying the `RETRIEVER=custom` env var. +Custom retrievers let you use any search engine that exposes an API for retrieving documents; they are widely used for enterprise research tasks. + +In addition to setting the `RETRIEVER` env var, you also need to set the following env vars: +- `RETRIEVER_ENDPOINT`: The endpoint URL of the custom retriever. +- Additional arguments required by the retriever should be prefixed with `RETRIEVER_ARG_` (e.g., `RETRIEVER_ARG_API_KEY`). + +### Example +```bash +RETRIEVER=custom +RETRIEVER_ENDPOINT=https://api.myretriever.com +RETRIEVER_ARG_API_KEY=YOUR_API_KEY +``` + +### Response Format +For the custom retriever to work correctly, the response from the endpoint should be in the following format: +```json +[ + { + "url": "http://example.com/page1", + "raw_content": "Content of page 1" + }, + { + "url": "http://example.com/page2", + "raw_content": "Content of page 2" + } +] +``` + +The system assumes this response format and processes the list of sources accordingly. + +Missing a retriever? Feel free to contribute to this project by submitting issues or pull requests on our [GitHub](https://github.com/assafelovic/gpt-researcher) page. 
class CustomRetriever:
    """
    Custom API Retriever.

    Sends the research query to a user-supplied HTTP endpoint. Configuration
    comes entirely from environment variables:

    - ``RETRIEVER_ENDPOINT``: URL that receives the GET request (required).
    - ``RETRIEVER_ARG_<NAME>``: forwarded as query parameter ``<name>``
      (prefix stripped, name lowercased).
    """

    # Prefix that marks environment variables forwarded to the endpoint.
    ARG_PREFIX = 'RETRIEVER_ARG_'

    # Seconds to wait for the endpoint. requests applies no timeout by
    # default, so without this an unresponsive endpoint would block the
    # caller indefinitely.
    REQUEST_TIMEOUT_SECONDS = 60

    def __init__(self, query: str):
        """
        Read the retriever configuration from the environment.

        :param query: Search text sent to the endpoint as the ``query`` parameter.
        :raises ValueError: If ``RETRIEVER_ENDPOINT`` is unset or empty.
        """
        self.endpoint = os.getenv('RETRIEVER_ENDPOINT')
        if not self.endpoint:
            raise ValueError("RETRIEVER_ENDPOINT environment variable not set")

        self.params = self._populate_params()
        self.query = query

    def _populate_params(self) -> Dict[str, Any]:
        """
        Populates parameters from environment variables prefixed with
        'RETRIEVER_ARG_'.

        :return: Mapping of lowercased, prefix-stripped variable names to
                 their values (e.g. RETRIEVER_ARG_API_KEY -> 'api_key').
        """
        prefix = self.ARG_PREFIX
        return {
            key[len(prefix):].lower(): value
            for key, value in os.environ.items()
            if key.startswith(prefix)
        }

    def search(self, max_results: int = 5) -> Optional[List[Dict[str, Any]]]:
        """
        Performs the search using the custom retriever endpoint.

        Issues a GET request to the endpoint with the RETRIEVER_ARG_*
        parameters plus ``query``. ``query`` is merged last, so it takes
        precedence over a RETRIEVER_ARG_QUERY variable.

        :param max_results: Maximum number of results to return (not currently
                            used; the endpoint's response is returned as-is)
        :return: The decoded JSON response, expected in the format:
            [
              {
                "url": "http://example.com/page1",
                "raw_content": "Content of page 1"
              },
              {
                "url": "http://example.com/page2",
                "raw_content": "Content of page 2"
              }
            ]
            or None if the request fails, times out, returns an HTTP error
            status, or the body is not valid JSON.
        """
        request_params = {**self.params, 'query': self.query}
        try:
            response = requests.get(
                self.endpoint,
                params=request_params,
                timeout=self.REQUEST_TIMEOUT_SECONDS,
            )
            response.raise_for_status()
            return response.json()
        # ValueError covers a non-JSON body: depending on the installed
        # requests version, response.json() raises either requests'
        # JSONDecodeError or a plain ValueError subclass.
        except (requests.RequestException, ValueError) as e:
            print(f"Failed to retrieve search results: {e}")
            return None