diff --git a/evo_researcher/benchmark/benchmark.py b/evo_researcher/benchmark/benchmark.py
index 16f016b5..1cf2820e 100644
--- a/evo_researcher/benchmark/benchmark.py
+++ b/evo_researcher/benchmark/benchmark.py
@@ -15,10 +15,11 @@
 )
 from evo_researcher.benchmark.utils import (
     Market,
+    MarketSource,
     Prediction,
     PredictionsCache,
     get_llm_api_call_cost,
-    get_manifold_markets,
+    get_markets,
 )
 
 
@@ -61,7 +62,6 @@ def __init__(
             "Mean info_utility": self._compute_mean_info_utility,
             "Mean cost ($)": self._compute_mean_cost,
             "Mean time (s)": self._compute_mean_time,
-            # b) correlation between confidence and prediction error relative to the reference
         }
         self.metric_fns.update(predefined_metric_fns)
 
@@ -221,7 +221,7 @@ def get_markets_summary(self) -> t.Dict[str, t.List[str]]:
             markets_summary[f"{model_type} p_yes"] = [
                 p.p_yes for p in self.predictions[model_type].values()
             ]
-        markets_summary["manifold p_yes"] = [m.p_yes for m in self.markets]
+        markets_summary[f"reference p_yes"] = [m.p_yes for m in self.markets]
         return markets_summary
 
     def generate_markdown_report(self):
@@ -241,10 +241,16 @@ def generate_markdown_report(self):
         type=str,
         default="./benchmark_report.md",
     )
+    args.add_argument(
+        "--reference",
+        type=str,
+        choices=[ms.value for ms in MarketSource],
+        default="manifold",
+    )
     args = args.parse_args()
 
     benchmarker = Benchmarker(
-        markets=get_manifold_markets(number=3),
+        markets=get_markets(number=3, source=MarketSource(args.reference)),
         agents=[
             OlasAgent(model="gpt-3.5-turbo"),  # TODO use same models!
             EvoAgent(model="gpt-4-1106-preview"),
diff --git a/evo_researcher/benchmark/utils.py b/evo_researcher/benchmark/utils.py
index 513c191e..95fb9002 100644
--- a/evo_researcher/benchmark/utils.py
+++ b/evo_researcher/benchmark/utils.py
@@ -1,10 +1,18 @@
+from dotenv import load_dotenv
+from enum import Enum
 import os
 import requests
 import typing as t
 from pydantic import BaseModel
 
 
+class MarketSource(Enum):
+    """Prediction-market platforms that markets can be fetched from."""
+
+    # The string values double as the accepted choices of the benchmark
+    # script's `--reference` command-line option.
+    MANIFOLD = "manifold"
+    POLYMARKET = "polymarket"
+
+
 class Market(BaseModel):
+    source: MarketSource
     question: str
     url: str
     p_yes: float
@@ -53,19 +61,23 @@ def load(cls, markets: t.List[Market], path: str):
         }
 
 
-def get_manifold_markets(number: int = 100) -> t.List[Market]:
+def get_manifold_markets(
+    number: int = 100, excluded_questions: t.List[str] = []
+) -> t.List[Market]:
     url = "https://api.manifold.markets/v0/search-markets"
     params = {
         "term": "",
         "sort": "liquidity",
         "filter": "open",
-        "limit": f"{number}",
+        "limit": f"{number + len(excluded_questions)}",
         "contractType": "BINARY",  # TODO support CATEGORICAL markets
     }
     response = requests.get(url, params=params)
 
     response.raise_for_status()
     markets_json = response.json()
+    for m in markets_json:
+        m["source"] = MarketSource.MANIFOLD
 
     # Map JSON fields to Market fields
     fields_map = {
@@ -78,10 +90,61 @@ def _map_fields(old: dict, mapping: dict) -> dict:
 
     markets = [Market.parse_obj(_map_fields(m, fields_map)) for m in markets_json]
     markets = [m for m in markets if not m.is_resolved]
-    assert len(markets) == number
+
+    # Filter out markets with excluded questions
+    markets = [m for m in markets if m.question not in excluded_questions]
+
+    return markets[:number]
+
+
+def get_polymarket_markets(
+    number: int = 100, excluded_questions: t.Optional[t.List[str]] = None
+) -> t.List[Market]:
+    """
+    Fetch at most `number` open, binary (Yes/No) markets from Polymarket.
+
+    Markets whose question appears in `excluded_questions` are skipped. The
+    result may contain fewer than `number` markets, because non-binary
+    markets can only be filtered out after the API call.
+
+    Raises ValueError if `number` exceeds 100, and requests.HTTPError if the
+    API responds with an error status.
+    """
+    if number > 100:
+        raise ValueError("Polymarket API only returns 100 markets at a time")
+
+    # A None default avoids sharing one mutable list between calls; a set
+    # gives constant-time membership tests in the loop below.
+    excluded = set(excluded_questions or [])
+
+    # Over-fetch by the number of exclusions so that dropping them can still
+    # leave `number` markets.
+    api_uri = f"https://strapi-matic.poly.market/markets?_limit={number + len(excluded)}&active=true&closed=false"
+    response = requests.get(api_uri)
+    # Fail loudly on HTTP errors instead of parsing an error body as markets
+    response.raise_for_status()
+
+    markets: t.List[Market] = []
+    for m_json in response.json():
+        # The request over-fetches, so stop once enough markets are collected
+        if len(markets) >= number:
+            break
+
+        # Skip non-binary markets. Unfortunately no way to filter in the API call
+        if m_json["outcomes"] != ["Yes", "No"]:
+            continue
+
+        if m_json["question"] in excluded:
+            print(f"Skipping market with 'excluded question': {m_json['question']}")
+            continue
+
+        markets.append(
+            Market(
+                question=m_json["question"],
+                url=f"https://polymarket.com/event/{m_json['slug']}",
+                p_yes=m_json["outcomePrices"][0],
+                volume=m_json["volume"],
+                is_resolved=False,
+                source=MarketSource.POLYMARKET,
+            )
+        )
     return markets
 
 
+def get_markets(
+    number: int,
+    source: MarketSource,
+    excluded_questions: t.Optional[t.List[str]] = None,
+) -> t.List[Market]:
+    """
+    Fetch markets from the given `source`.
+
+    Dispatches to the source-specific fetcher, forwarding `number` and
+    `excluded_questions` (treated as empty when omitted).
+
+    Raises ValueError if `source` is neither MarketSource.MANIFOLD nor
+    MarketSource.POLYMARKET.
+    """
+    # A None default avoids sharing one mutable list between calls. It is
+    # normalised here because the fetchers call len() on the argument.
+    excluded = [] if excluded_questions is None else excluded_questions
+
+    if source == MarketSource.MANIFOLD:
+        return get_manifold_markets(number=number, excluded_questions=excluded)
+    if source == MarketSource.POLYMARKET:
+        return get_polymarket_markets(number=number, excluded_questions=excluded)
+    raise ValueError(f"Unknown market source: {source}")
+
+
 def get_llm_api_call_cost(model: str, prompt_tokens: int, completion_tokens) -> float:
     """
     In older versions of langchain, the cost calculation doesn't work for