Add two new agents into the benchmark: Autonolas with OpenAI embeddings and Autonolas with Rephrasing questions (#5)

* Evaluate question before researching it

* fix var name

* Add token probability for `yes` and `no` to the benchmark

* fix types

* Add two new agents into the benchmark: Autonolas with OpenAI embeddings and Autonolas with rephrasing questions

* fix

* fix

* fix

* fix

* remove unused vars

* rename

* Review comments

* rabbit review

* remove bad merge

* rabbit review

* fix bad conflict
kongzii authored Feb 7, 2024
1 parent ffd9575 commit 4de102b
Showing 7 changed files with 935 additions and 20 deletions.
62 changes: 46 additions & 16 deletions evo_researcher/autonolas/research.py
@@ -1,6 +1,7 @@

import os
import math
from sklearn.metrics.pairwise import cosine_similarity
from typing import Any, Dict, Generator, List, Optional, Tuple, TypedDict
from datetime import datetime, timezone
import json
@@ -9,7 +10,7 @@
from concurrent.futures import Future, ThreadPoolExecutor
from itertools import groupby
from operator import itemgetter

from enum import Enum
from bs4 import BeautifulSoup, NavigableString
from googleapiclient.discovery import build

@@ -22,8 +23,11 @@
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.embeddings.openai import OpenAIEmbeddings

from dateutil import parser
from evo_researcher.functions.cache import persistent_inmemory_cache
from evo_researcher.functions.parallelism import par_map

load_dotenv()

@@ -295,6 +299,12 @@
"link",
]


class EmbeddingModel(Enum):
spacy = "spacy"
openai = "openai"


class Prediction(TypedDict):
decision: Optional[str]
decision_token_prob: Optional[float]
@@ -316,7 +326,10 @@ def search_google(query: str, api_key: str, engine: str, num: int = 3) -> List[str]:
)
.execute()
)
return [result["link"] for result in search["items"]]
try:
return [result["link"] for result in search["items"]]
except KeyError as e:
raise ValueError(f"Can not parse results: {search}") from e


def download_spacy_model(model_name: str) -> None:
@@ -627,12 +640,19 @@ def concatenate_short_sentences(sentences, len_sentence_threshold):
return modified_sentences


@persistent_inmemory_cache
def openai_embedding_cached(text: str, model: str = "text-embedding-ada-002") -> list[float]:
emb = OpenAIEmbeddings(model=model)
return emb.embed_query(text)


def extract_similarity_scores(
text: str,
query_emb,
doc_question,
event_date: str,
nlp,
date: str,
embedding_model: EmbeddingModel,
) -> List[Tuple[str, float, str]]:
"""
Extract relevant information from website text based on a given event question.
@@ -690,13 +710,18 @@ def extract_similarity_scores(
# Limit the number of sentences for performance optimization
sentences = sentences[:num_sentences_threshold]

similarities = []

# Encode sentences using spaCy model
for i, sentence in enumerate(sentences):
doc_sentence = nlp(sentence)
similarity_score = query_emb.similarity(doc_sentence)
similarities.append(similarity_score)
# Encode sentences using an embedding model
similarities = par_map(
sentences,
lambda sentence: (
doc_question.similarity(nlp(sentence)) if embedding_model == EmbeddingModel.spacy
else cosine_similarity(
[openai_embedding_cached(sentence)],
[openai_embedding_cached(doc_question.text)]
)[0][0] if embedding_model == EmbeddingModel.openai
else None
)
)

# Create tuples and store them in a list
sentence_similarity_date_tuples = [
@@ -752,9 +777,10 @@ def get_date(soup):

def extract_sentences(
html: str,
query_emb,
doc_question,
event_date: str,
nlp,
embedding_model: EmbeddingModel,
) -> List[Tuple[str, float, str]]:
"""
Extract relevant information from HTML string.
@@ -796,10 +822,11 @@
# Get List of (sentence, similarity, date) tuples
similarity_scores = extract_similarity_scores(
text=text,
query_emb=query_emb,
doc_question=doc_question,
event_date=event_date,
nlp=nlp,
date=date,
embedding_model=embedding_model,
)

if not similarity_scores:
@@ -883,6 +910,7 @@ def extract_and_sort_sentences(
urls: List[str],
event_question: str,
nlp,
embedding_model: EmbeddingModel,
) -> List[Tuple[str, float, str]]:
"""
Extract texts from a list of URLs using Spacy models.
@@ -906,9 +934,6 @@
doc_question = nlp(event_question)
event_date = extract_event_date(doc_question)

# Create embedding for event question with Spacy embedder model
query_emb = nlp(event_question)

if event_date is None:
print(
f"Could not extract precise event date from event question: {event_question}"
@@ -925,9 +950,10 @@
# Extract relevant information for the event question
extracted_sentences = extract_sentences(
html=result.text,
query_emb=query_emb,
doc_question=doc_question,
event_date=event_date,
nlp=nlp,
embedding_model=embedding_model,
)

# Delete the result object to free memory
@@ -1002,6 +1028,7 @@ def fetch_additional_information(
google_api_key: str,
google_engine: str,
nlp,
embedding_model: EmbeddingModel,
engine: str = "gpt-3.5-turbo",
temperature: float = 0.5,
max_compl_tokens: int = 500,
@@ -1072,6 +1099,7 @@ def fetch_additional_information(
urls=urls,
event_question=event_question,
nlp=nlp,
embedding_model=embedding_model,
)

# Join the sorted sentences and group them by date
@@ -1086,6 +1114,7 @@ def research(
max_tokens: int = None,
temperature: int = None,
engine: str = "gpt-3.5-turbo",
embedding_model: EmbeddingModel = EmbeddingModel.spacy,
) -> str:
prompt = f"\"{prompt}\""
max_compl_tokens = max_tokens or DEFAULT_OPENAI_SETTINGS["max_compl_tokens"]
@@ -1121,6 +1150,7 @@ def research(
max_add_words=max_add_words,
google_api_key=os.getenv("GOOGLE_SEARCH_API_KEY"),
google_engine=os.getenv("GOOGLE_SEARCH_ENGINE_ID"),
embedding_model=embedding_model,
)

# Truncate additional information to stay within the chat completion token limit of 4096
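Taken together, the research.py changes make the sentence scorer pluggable: spaCy vector similarity remains the default, while the new EmbeddingModel.openai branch compares cached OpenAI embeddings by cosine similarity, fanned out across sentences with par_map. A minimal sketch of the per-sentence scoring, assuming a loaded spaCy pipeline `nlp` and the `openai_embedding_cached` helper above (the name `score_sentence` is illustrative, not part of the commit):

from sklearn.metrics.pairwise import cosine_similarity

def score_sentence(sentence: str, doc_question, nlp, embedding_model: EmbeddingModel) -> float:
    # spaCy branch: built-in Doc-to-Doc vector similarity.
    if embedding_model == EmbeddingModel.spacy:
        return doc_question.similarity(nlp(sentence))
    # OpenAI branch: cosine similarity between cached text-embedding-ada-002 vectors.
    if embedding_model == EmbeddingModel.openai:
        return cosine_similarity(
            [openai_embedding_cached(sentence)],
            [openai_embedding_cached(doc_question.text)],
        )[0][0]
    raise ValueError(f"Unknown embedding model: {embedding_model}")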
47 changes: 45 additions & 2 deletions evo_researcher/benchmark/agents.py
@@ -4,8 +4,10 @@
import typing as t

from evo_researcher.functions.evaluate_question import evaluate_question, EvalautedQuestion
from evo_researcher.functions.rephrase_question import rephrase_question
from evo_researcher.functions.research import research as research_evo
from evo_researcher.autonolas.research import (
EmbeddingModel,
make_prediction,
Prediction as LLMCompletionPredictionDict,
research as research_autonolas,
@@ -53,7 +55,7 @@ def evaluate(self, market_question: str) -> EvalautedQuestion:

def research(self, market_question: str) -> t.Optional[str]:
raise NotImplementedError

def predict(self, market_question: str, researched: str, evaluated: EvalautedQuestion) -> Prediction:
raise NotImplementedError

@@ -70,11 +72,13 @@ def evaluate_research_predict(self, market_question: str) -> Prediction:
evaluated=eval,
)


class OlasAgent(AbstractBenchmarkedAgent):
def __init__(self, model: str, temperature: float, agent_name: str = "olas", max_workers: t.Optional[int] = None):
def __init__(self, model: str, temperature: float, agent_name: str = "olas", max_workers: t.Optional[int] = None, embedding_model: EmbeddingModel = EmbeddingModel.spacy):
super().__init__(agent_name=agent_name, max_workers=max_workers)
self.model = model
self.temperature = temperature
self.embedding_model = embedding_model

def evaluate(self, market_question: str) -> EvalautedQuestion:
return evaluate_question(question=market_question)
@@ -84,6 +88,7 @@ def research(self, market_question: str) -> t.Optional[str]:
return research_autonolas(
prompt=market_question,
engine=self.model,
embedding_model=self.embedding_model,
)
except ValueError as e:
print(f"Error in OlasAgent's research: {e}")
@@ -141,7 +146,45 @@ def predict(self, market_question: str, researched: str, evaluated: EvalautedQuestion) -> Prediction:
return Prediction(evaluation=evaluated)


class RephrasingOlasAgent(OlasAgent):
def __init__(
self,
model: str,
temperature: float,
agent_name: str = "reph-olas",
max_workers: t.Optional[int] = None,
embedding_model: EmbeddingModel = EmbeddingModel.spacy,
):
super().__init__(
model=model,
temperature=temperature,
embedding_model=embedding_model,
agent_name=agent_name,
max_workers=max_workers,
)

def research(self, market_question: str) -> t.Optional[str]:
questions = rephrase_question(question=market_question)

report_original = super().research(market_question=questions.original_question)
report_negated = super().research(market_question=questions.negated_question)
report_universal = super().research(market_question=questions.open_ended_question)

report_concat = "\n\n---\n\n".join([
f"### {r_name}\n\n{r}"
for r_name, r in [
("Research based on the question", report_original),
("Research based on the negated question", report_negated),
("Research based on the universal search query", report_universal)
]
if r is not None
])

return report_concat


AGENTS = [
OlasAgent,
RephrasingOlasAgent,
EvoAgent,
]
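A short usage sketch of the new agents, assuming the required OpenAI and Google Search credentials are present in the environment; the model name, temperature, and market question are illustrative:

from evo_researcher.autonolas.research import EmbeddingModel
from evo_researcher.benchmark.agents import OlasAgent, RephrasingOlasAgent

# Autonolas agent that scores sentences with OpenAI embeddings instead of spaCy.
olas_openai = OlasAgent(
    model="gpt-3.5-turbo",
    temperature=0.7,
    agent_name="olas-openai",
    embedding_model=EmbeddingModel.openai,
)

# Variant that researches the original, negated, and open-ended rephrasings.
olas_rephrasing = RephrasingOlasAgent(model="gpt-3.5-turbo", temperature=0.7)

prediction = olas_openai.evaluate_research_predict("Will GPT-5 be released by the end of 2024?")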
23 changes: 23 additions & 0 deletions evo_researcher/functions/parallelism.py
@@ -0,0 +1,23 @@
import os
import concurrent
from typing import Callable, TypeVar
from concurrent.futures.thread import ThreadPoolExecutor

THREADPOOL = ThreadPoolExecutor(int(os.getenv("THREADPOOL_N_THREADS", 50)))

A = TypeVar("A")
B = TypeVar("B")

def par_map(
items: list[A], func: Callable[[A], B], executor: concurrent.futures.Executor = THREADPOOL
) -> "list[B]":
"""Applies the function to each element using the specified executor. Awaits for all results.
If executor is ProcessPoolExecutor, make sure the function passed is pickable, e.g. no lambda functions
"""
futures: list[concurrent.futures._base.Future[B]] = [
executor.submit(func, item) for item in items
]
results = []
for fut in futures:
results.append(fut.result())
return results
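A minimal usage example: par_map submits every item to the shared thread pool and returns the results in input order.

from evo_researcher.functions.parallelism import par_map

squares = par_map(items=[1, 2, 3, 4], func=lambda x: x * x)
assert squares == [1, 4, 9, 16]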
54 changes: 54 additions & 0 deletions evo_researcher/functions/rephrase_question.py
@@ -0,0 +1,54 @@
import json
import tiktoken
from pydantic import BaseModel
from langchain.llms import OpenAI
from langchain_openai import ChatOpenAI
from evo_researcher.autonolas.research import clean_completion_json
from langchain.prompts import ChatPromptTemplate


QUESTION_REPHRASE_PROMPT = """We have the following question: `{question}`
Write a dictionary with following keys, don't answer the question, only rewrite it in the following ways:
```
- open_ended_question: Ask the question universally
- negated_question Ask the question in negation
```
"""


class RephrasedQuestion(BaseModel):
original_question: str
negated_question: str
open_ended_question: str


def rephrase_question(
question: str,
engine: str = "gpt-4-0125-preview"
) -> RephrasedQuestion:
"""
Rephrase the original question, by asking it in negation and universally, for example:
original_question: Is the sky blue?
negated_question: Is the sky not blue?
open_ended_question: What is the color of the sky?
"""
tokenizer = tiktoken.encoding_for_model(engine)
llm = ChatOpenAI(model=engine, temperature=0.0)

prompt = ChatPromptTemplate.from_template(template=QUESTION_REPHRASE_PROMPT)
messages = prompt.format_messages(question=question)

max_tokens = 2 * len(tokenizer.encode(question)) + 50 # Max tokens as the question two times + some buffer for formatting.
completion = llm(messages, max_tokens=max_tokens).content

try:
return RephrasedQuestion(
original_question=question,
**json.loads(clean_completion_json(completion))
)
except json.decoder.JSONDecodeError as e:
raise ValueError(f"Error in rephrase_question for `{question}`: {completion}") from e
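A usage sketch of rephrase_question; the exact rephrasings depend on the model, so the outputs shown (taken from the docstring) are illustrative:

from evo_researcher.functions.rephrase_question import rephrase_question

questions = rephrase_question("Is the sky blue?")
print(questions.original_question)    # Is the sky blue?
print(questions.negated_question)     # e.g. Is the sky not blue?
print(questions.open_ended_question)  # e.g. What is the color of the sky?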

2 changes: 1 addition & 1 deletion evo_researcher/functions/research.py
@@ -20,7 +20,7 @@ def research(
top_k_per_query: int = 8
) -> tuple[str, str]:
queries = generate_subqueries(query=goal, limit=initial_subqueries_limit, api_key=openai_key)
queries = rerank_subqueries(queries=queries, goal=goal, api_key=openai_key)[:subqueries_limit]
queries = rerank_subqueries(queries=queries, goal=goal, api_key=openai_key)[:subqueries_limit] if initial_subqueries_limit > subqueries_limit else queries

search_results_with_queries = search(queries, tavily_key, lambda result: not result["url"].startswith("https://www.youtube"))

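The one-line change above guards the reranking step: when the initial subquery limit is not larger than the final limit, reranking cannot narrow the list, so the extra LLM call is skipped. Written out as a plain conditional, assuming the surrounding names in research():

# Rerank (and truncate) only when reranking can actually narrow the list.
if initial_subqueries_limit > subqueries_limit:
    queries = rerank_subqueries(queries=queries, goal=goal, api_key=openai_key)[:subqueries_limit]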