Merge pull request #602 from assafelovic/feature/custom_retriever

added implementation for custom retriever
assafelovic · Jun 16, 2024 · 686bc9c · 686bc9c
2 parents 4761079 + be8c6dd
commit 686bc9c
Show file tree

Hide file tree

Showing 5 changed files with 114 additions and 0 deletions.
diff --git a/docs/docs/gpt-researcher/retrievers.md b/docs/docs/gpt-researcher/retrievers.md
@@ -0,0 +1,56 @@
+# Retrievers
+
+Retrievers are search engines used to find the most relevant documents for a given research task.
+You can specify your preferred web search or use any custom retriever of your choice.
+
+## Web Search Engines
+GPT Researcher defaults to using the [Tavily](https://app.tavily.com) search engine for retrieving search results. 
+But you can also use other search engines by specifying the `RETRIEVER` env var. Please note that each search engine has its own API Key requirements and usage limits.
+
+For example:
+```bash
+RETRIEVER=bing
+```
+
+Thanks to our community, we have integrated the following web search engines:
+- [Tavily](https://app.tavily.com) - Default
+- [Bing](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) - Env: `RETRIEVER=bing`
+- [Google](https://developers.google.com/custom-search/v1/overview) - Env: `RETRIEVER=google`
+- [Serp API](https://serpapi.com/) - Env: `RETRIEVER=serpapi`
+- [Serper](https://serper.dev/) - Env: `RETRIEVER=serper`
+- [Searx](https://searx.github.io/searx/) - Env: `RETRIEVER=searx`
+- [DuckDuckGo](https://pypi.org/project/duckduckgo-search/) - Env: `RETRIEVER=duckduckgo`
+
+## Custom Retrievers
+You can also use any custom retriever of your choice by specifying the `RETRIEVER=custom` env var.
+Custom retrievers let you use any search engine that exposes an API for retrieving documents. They are widely used for enterprise research tasks.
+
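+In addition to setting the `RETRIEVER` env var, you also need to set the following env vars:
+- `RETRIEVER_ENDPOINT`: The endpoint URL of the custom retriever. It is called with an HTTP GET request, and the search query is passed as the `query` parameter.
+- Additional arguments required by the retriever should be prefixed with `RETRIEVER_ARG_` (e.g., `RETRIEVER_ARG_API_KEY`). The prefix is stripped and the rest of the name is lower-cased, so `RETRIEVER_ARG_API_KEY` is sent as the `api_key` query parameter.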
+
+### Example
+```bash
+RETRIEVER=custom
+RETRIEVER_ENDPOINT=https://api.myretriever.com
+RETRIEVER_ARG_API_KEY=YOUR_API_KEY
+```
+
+### Response Format
+For the custom retriever to work correctly, the response from the endpoint should be in the following format:
+```json
+[
+  {
+    "url": "http://example.com/page1",
+    "raw_content": "Content of page 1"
+  },
+  {
+    "url": "http://example.com/page2",
+    "raw_content": "Content of page 2"
+  }
+]
+```
+
+The system assumes this response format and processes the list of sources accordingly.
+
+Missing a retriever? Feel free to contribute to this project by submitting issues or pull requests on our [GitHub](https://github.com/assafelovic/gpt-researcher) page.
diff --git a/docs/sidebars.js b/docs/sidebars.js
@@ -33,6 +33,7 @@
       items: [
         'gpt-researcher/config',
         'gpt-researcher/tailored-research',
+        'gpt-researcher/retrievers',
         'gpt-researcher/llms',
         ]
     },

diff --git a/gpt_researcher/master/actions.py b/gpt_researcher/master/actions.py
@@ -41,6 +41,9 @@ def get_retriever(retriever):
         case "tavily":
             from gpt_researcher.retrievers import TavilySearch
             retriever = TavilySearch
+        case "custom":
+            from gpt_researcher.retrievers import CustomRetriever
+            retriever = CustomRetriever
 
         case _:
             raise Exception("Retriever not found.")

diff --git a/gpt_researcher/retrievers/__init__.py b/gpt_researcher/retrievers/__init__.py
@@ -5,9 +5,11 @@
 from .serpapi.serpapi import SerpApiSearch
 from .searx.searx import SearxSearch
 from .bing.bing import BingSearch
+from .custom.custom import CustomRetriever
 
 __all__ = [
     "TavilySearch",
+    "CustomRetriever",
     "Duckduckgo",
     "SerperSearch",
     "SerpApiSearch",

diff --git a/gpt_researcher/retrievers/custom/custom.py b/gpt_researcher/retrievers/custom/custom.py
@@ -0,0 +1,52 @@
+from typing import Any, Dict, List, Optional
+import requests
+import os
+
+
+class CustomRetriever:
+    """
+    Retriever that queries a user-supplied HTTP search endpoint.
+
+    Configuration is read from environment variables:
+      - RETRIEVER_ENDPOINT: URL that is queried with an HTTP GET request.
+      - RETRIEVER_ARG_*: extra query parameters; the prefix is stripped and
+        the rest of the name is lower-cased (RETRIEVER_ARG_API_KEY -> api_key).
+    """
+
+    # Prefix marking environment variables that are forwarded as query parameters.
+    ARG_PREFIX = 'RETRIEVER_ARG_'
+    # Seconds to wait for the endpoint. Without an explicit timeout, requests
+    # waits indefinitely and a stalled endpoint would hang the caller.
+    REQUEST_TIMEOUT = 30
+
+    def __init__(self, query: str):
+        """
+        Stores the query and loads the endpoint configuration from the environment.
+
+        :param query: Search query sent to the endpoint as the 'query' parameter
+        :raises ValueError: if RETRIEVER_ENDPOINT is unset or empty
+        """
+        self.endpoint = os.getenv('RETRIEVER_ENDPOINT')
+        if not self.endpoint:
+            raise ValueError("RETRIEVER_ENDPOINT environment variable not set")
+
+        self.params = self._populate_params()
+        self.query = query
+
+    def _populate_params(self) -> Dict[str, Any]:
+        """
+        Populates parameters from environment variables prefixed with 'RETRIEVER_ARG_'
+
+        :return: Mapping of lower-cased names (prefix removed) to their values
+        """
+        prefix = self.ARG_PREFIX
+        return {
+            key[len(prefix):].lower(): value
+            for key, value in os.environ.items()
+            if key.startswith(prefix)
+        }
+
+    def search(self, max_results: int = 5) -> Optional[List[Dict[str, Any]]]:
+        """
+        Performs the search using the custom retriever endpoint.
+
+        :param max_results: Maximum number of results to return (not currently used)
+        :return: The decoded JSON response, or None if the request fails, the
+            endpoint answers with an error status, or the body is not valid JSON.
+            The response is expected in the format:
+            [
+              {
+                "url": "http://example.com/page1",
+                "raw_content": "Content of page 1"
+              },
+              {
+                "url": "http://example.com/page2",
+                "raw_content": "Content of page 2"
+              }
+            ]
+        """
+        try:
+            response = requests.get(
+                self.endpoint,
+                # 'query' is merged last so it wins over any RETRIEVER_ARG_QUERY.
+                params={**self.params, 'query': self.query},
+                timeout=self.REQUEST_TIMEOUT,
+            )
+            response.raise_for_status()
+            return response.json()
+        except (requests.RequestException, ValueError) as e:
+            # ValueError covers a body that is not valid JSON: older requests
+            # releases raise a plain ValueError subclass there rather than a
+            # RequestException, which would otherwise escape this handler.
+            print(f"Failed to retrieve search results: {e}")
+            return None