diff --git a/.env.example b/.env.example
index da7aa521c..11e760346 100644
--- a/.env.example
+++ b/.env.example
@@ -1,9 +1,38 @@
+# .env.example
+
+# OpenAI API key for accessing OpenAI's GPT models
 OPENAI_API_KEY=
+
+# API key for accessing Tavily's services
 TAVILY_API_KEY=
+
+# API key for accessing LangChain's services
 LANGCHAIN_API_KEY=
+
+# Path to the directory where documents are stored
 DOC_PATH=./my-docs
 
-# the name of the embedding model to use for Ollama
+# The name of the embedding model to use for Ollama
 OLLAMA_EMBEDDING_MODEL=
-# the Ollama endpoint to use
-OLLAMA_BASE_URL=
\ No newline at end of file
+
+# The Ollama endpoint to use
+OLLAMA_BASE_URL=
+
+# Choose one of the available retrievers by uncommenting the desired retriever:
+# RETRIEVER=arxiv
+# RETRIEVER=bing
+# RETRIEVER=custom
+# RETRIEVER=duckduckgo
+# RETRIEVER=exa
+# RETRIEVER=google
+# RETRIEVER=searx
+# RETRIEVER=semantic_scholar
+# RETRIEVER=serpapi
+# RETRIEVER=serper
+# RETRIEVER=tavily
+
+# Example setting for retriever
+RETRIEVER=tavily
+
+# Maximum number of search results to return per query
+MAX_SEARCH_RESULTS_PER_QUERY=5
diff --git a/gpt_researcher/master/actions.py b/gpt_researcher/master/actions.py
index ddcdf860a..cfa672d1c 100644
--- a/gpt_researcher/master/actions.py
+++ b/gpt_researcher/master/actions.py
@@ -58,6 +58,10 @@ def get_retriever(retriever):
             from gpt_researcher.retrievers import ExaSearch
 
             retriever = ExaSearch
+        case "semantic_scholar":
+            from gpt_researcher.retrievers import SemanticScholarSearch
+
+            retriever = SemanticScholarSearch
         case "custom":
             from gpt_researcher.retrievers import CustomRetriever
 
diff --git a/gpt_researcher/retrievers/__init__.py b/gpt_researcher/retrievers/__init__.py
index 60fdc8e11..62954ae80 100644
--- a/gpt_researcher/retrievers/__init__.py
+++ b/gpt_researcher/retrievers/__init__.py
@@ -7,6 +7,7 @@
 from .serpapi.serpapi import SerpApiSearch
 from .serper.serper import SerperSearch
 from .tavily.tavily_search import TavilySearch
+from .semantic_scholar.semantic_scholar import SemanticScholarSearch
 
 __all__ = [
     "TavilySearch",
@@ -17,5 +18,6 @@
     "GoogleSearch",
     "SearxSearch",
     "BingSearch",
-    "ArxivSearch"
+    "ArxivSearch",
+    "SemanticScholarSearch",
 ]
diff --git a/gpt_researcher/retrievers/semantic_scholar/__init__.py b/gpt_researcher/retrievers/semantic_scholar/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/gpt_researcher/retrievers/semantic_scholar/semantic_scholar.py b/gpt_researcher/retrievers/semantic_scholar/semantic_scholar.py
new file mode 100644
index 000000000..db1dcf106
--- /dev/null
+++ b/gpt_researcher/retrievers/semantic_scholar/semantic_scholar.py
@@ -0,0 +1,84 @@
+from typing import Dict, List
+
+import requests
+
+
+class SemanticScholarSearch:
+    """
+    Semantic Scholar API Retriever
+
+    Queries the Semantic Scholar Graph API paper-search endpoint and keeps
+    only the papers that expose an open-access PDF link.
+    """
+
+    BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
+    VALID_SORT_CRITERIA = ["relevance", "citationCount", "publicationDate"]
+    # Seconds to wait for the API before giving up; requests has no default
+    # timeout, so an unresponsive server would otherwise block forever.
+    REQUEST_TIMEOUT = 10
+
+    def __init__(self, query: str, sort: str = "relevance"):
+        """
+        Initialize the SemanticScholarSearch class with a query and sort criterion.
+
+        :param query: Search query string
+        :param sort: Sort criterion ('relevance', 'citationCount', 'publicationDate')
+        :raises ValueError: If sort is not one of VALID_SORT_CRITERIA
+        """
+        # Raise explicitly instead of asserting: asserts are stripped under -O.
+        if sort not in self.VALID_SORT_CRITERIA:
+            raise ValueError(
+                f"Invalid sort criterion: {sort!r}. "
+                f"Expected one of {self.VALID_SORT_CRITERIA}"
+            )
+        self.query = query
+        # Keep the validated spelling: lower-casing would turn 'citationCount'
+        # into 'citationcount', which is no longer a valid criterion.
+        self.sort = sort
+
+    def search(self, max_results: int = 20) -> List[Dict[str, str]]:
+        """
+        Perform the search on Semantic Scholar and return results.
+
+        Only open-access papers that carry a PDF entry are returned.
+
+        :param max_results: Maximum number of results to request from the API
+        :return: List of dictionaries containing title, href, and body of each paper;
+            an empty list if the request fails or the response is not valid JSON
+        """
+        params = {
+            "query": self.query,
+            "limit": max_results,
+            "fields": "title,abstract,url,venue,year,authors,isOpenAccess,openAccessPdf",
+            "sort": self.sort,
+        }
+
+        try:
+            response = requests.get(
+                self.BASE_URL, params=params, timeout=self.REQUEST_TIMEOUT
+            )
+            response.raise_for_status()
+            payload = response.json()
+        except (requests.RequestException, ValueError) as e:
+            # ValueError covers response bodies that cannot be decoded as JSON.
+            print(f"An error occurred while accessing Semantic Scholar API: {e}")
+            return []
+
+        search_result = []
+
+        # "data" may be missing or null; treat both as "no results".
+        for result in payload.get("data") or []:
+            pdf_info = result.get("openAccessPdf")
+            if not (result.get("isOpenAccess") and pdf_info):
+                continue
+            search_result.append(
+                {
+                    # Present-but-null fields bypass dict.get defaults,
+                    # hence the explicit `or` fallbacks.
+                    "title": result.get("title") or "No Title",
+                    "href": pdf_info.get("url") or "No URL",
+                    "body": result.get("abstract") or "Abstract not available",
+                }
+            )
+
+        return search_result