diff --git a/evo_researcher/benchmark/benchmark.py b/evo_researcher/benchmark/benchmark.py
index 16f016b5..1cf2820e 100644
--- a/evo_researcher/benchmark/benchmark.py
+++ b/evo_researcher/benchmark/benchmark.py
@@ -15,10 +15,11 @@
 )
 from evo_researcher.benchmark.utils import (
     Market,
+    MarketSource,
     Prediction,
     PredictionsCache,
     get_llm_api_call_cost,
-    get_manifold_markets,
+    get_markets,
 )
 
 
@@ -61,7 +62,6 @@ def __init__(
             "Mean info_utility": self._compute_mean_info_utility,
             "Mean cost ($)": self._compute_mean_cost,
             "Mean time (s)": self._compute_mean_time,
-            # b) correlation between confidence and prediction error relative to the reference
         }
         self.metric_fns.update(predefined_metric_fns)
 
@@ -221,7 +221,7 @@ def get_markets_summary(self) -> t.Dict[str, t.List[str]]:
             markets_summary[f"{model_type} p_yes"] = [
                 p.p_yes for p in self.predictions[model_type].values()
             ]
-        markets_summary["manifold p_yes"] = [m.p_yes for m in self.markets]
+        markets_summary["reference p_yes"] = [m.p_yes for m in self.markets]
         return markets_summary
 
     def generate_markdown_report(self):
@@ -241,10 +241,16 @@ def generate_markdown_report(self):
         type=str,
         default="./benchmark_report.md",
     )
+    args.add_argument(
+        "--reference",
+        type=str,
+        choices=[ms.value for ms in MarketSource],
+        default="manifold",
+    )
     args = args.parse_args()
 
     benchmarker = Benchmarker(
-        markets=get_manifold_markets(number=3),
+        markets=get_markets(number=3, source=MarketSource(args.reference)),
         agents=[
             OlasAgent(model="gpt-3.5-turbo"),  # TODO use same models!
             EvoAgent(model="gpt-4-1106-preview"),
diff --git a/evo_researcher/benchmark/utils.py b/evo_researcher/benchmark/utils.py
index 513c191e..95fb9002 100644
--- a/evo_researcher/benchmark/utils.py
+++ b/evo_researcher/benchmark/utils.py
@@ -1,10 +1,19 @@
+from dotenv import load_dotenv
+from enum import Enum
 import os
 import requests
 import typing as t
 from pydantic import BaseModel
 
 
+class MarketSource(Enum):
+    """Prediction-market platforms that markets can be fetched from."""
+    MANIFOLD = "manifold"
+    POLYMARKET = "polymarket"
+
+
 class Market(BaseModel):
+    source: MarketSource
     question: str
     url: str
     p_yes: float
@@ -53,19 +62,30 @@ def load(cls, markets: t.List[Market], path: str):
         }
 
 
-def get_manifold_markets(number: int = 100) -> t.List[Market]:
+def get_manifold_markets(
+    number: int = 100, excluded_questions: t.Sequence[str] = ()
+) -> t.List[Market]:
+    """
+    Fetch up to `number` open, unresolved binary markets from Manifold.
+
+    Markets whose question is in `excluded_questions` are dropped after
+    fetching, so that many extra markets are requested up front to keep
+    the result as close to `number` as possible.
+    """
     url = "https://api.manifold.markets/v0/search-markets"
     params = {
         "term": "",
         "sort": "liquidity",
         "filter": "open",
-        "limit": f"{number}",
+        "limit": f"{number + len(excluded_questions)}",
         "contractType": "BINARY",  # TODO support CATEGORICAL markets
     }
     response = requests.get(url, params=params)
 
     response.raise_for_status()
     markets_json = response.json()
+    for m in markets_json:
+        m["source"] = MarketSource.MANIFOLD
 
     # Map JSON fields to Market fields
     fields_map = {
@@ -78,10 +98,80 @@ def _map_fields(old: dict, mapping: dict) -> dict:
 
     markets = [Market.parse_obj(_map_fields(m, fields_map)) for m in markets_json]
     markets = [m for m in markets if not m.is_resolved]
-    assert len(markets) == number
+
+    # Filter out markets with excluded questions
+    markets = [m for m in markets if m.question not in excluded_questions]
+
+    return markets[:number]
+
+
+def get_polymarket_markets(
+    number: int = 100, excluded_questions: t.Sequence[str] = ()
+) -> t.List[Market]:
+    """
+    Fetch up to `number` active, non-closed binary markets from Polymarket.
+
+    Only markets whose outcomes are exactly ["Yes", "No"] are kept, and
+    markets whose question is in `excluded_questions` are skipped.
+
+    Raises:
+        ValueError: if `number` is greater than 100.
+        requests.HTTPError: if the API responds with an error status.
+    """
+    if number > 100:
+        raise ValueError("Polymarket API only returns 100 markets at a time")
+
+    api_uri = f"https://strapi-matic.poly.market/markets?_limit={number + len(excluded_questions)}&active=true&closed=false"
+    response = requests.get(api_uri)
+    response.raise_for_status()
+    ms_json = response.json()
+    markets: t.List[Market] = []
+    for m_json in ms_json:
+        # Skip non-binary markets. Unfortunately no way to filter in the API call
+        if m_json["outcomes"] != ["Yes", "No"]:
+            continue
+
+        if m_json["question"] in excluded_questions:
+            print(f"Skipping market with 'excluded question': {m_json['question']}")
+            continue
+
+        markets.append(
+            Market(
+                question=m_json["question"],
+                url=f"https://polymarket.com/event/{m_json['slug']}",
+                p_yes=m_json["outcomePrices"][0],
+                volume=m_json["volume"],
+                is_resolved=False,
+                source=MarketSource.POLYMARKET,
+            )
+        )
+    # Extra markets were requested to make up for exclusions; cap the result.
-    return markets
+    return markets[:number]
 
 
+def get_markets(
+    number: int,
+    source: MarketSource,
+    excluded_questions: t.Sequence[str] = (),
+) -> t.List[Market]:
+    """
+    Fetch markets from the given `source`.
+
+    Raises:
+        ValueError: if `source` is not a known `MarketSource`.
+    """
+    if source == MarketSource.MANIFOLD:
+        return get_manifold_markets(
+            number=number, excluded_questions=excluded_questions
+        )
+    elif source == MarketSource.POLYMARKET:
+        return get_polymarket_markets(
+            number=number, excluded_questions=excluded_questions
+        )
+    else:
+        raise ValueError(f"Unknown market source: {source}")
+
+
 def get_llm_api_call_cost(model: str, prompt_tokens: int, completion_tokens) -> float:
     """
     In older versions of langchain, the cost calculation doesn't work for