From 607a1edaf84387f52db54a0a83f3c5f5c2b33f50 Mon Sep 17 00:00:00 2001 From: dnth Date: Tue, 3 Dec 2024 14:20:00 +0800 Subject: [PATCH] add bm25 as dep --- nbs/bm25_coco-captions.ipynb | 28 ++++++++++++++-------------- pyproject.toml | 2 ++ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/nbs/bm25_coco-captions.ipynb b/nbs/bm25_coco-captions.ipynb index ae6093c..b634e12 100644 --- a/nbs/bm25_coco-captions.ipynb +++ b/nbs/bm25_coco-captions.ipynb @@ -100,22 +100,22 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-12-03 13:45:10.709\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxretrieval.core\u001b[0m:\u001b[36mrun_benchmark_bm25\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mRunning BM25 retrieval benchmark\u001b[0m\n", - "\u001b[32m2024-12-03 13:45:10.710\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxretrieval.datasets.coco\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1mCOCO validation dataset found in data/coco/val2017, skipping download\u001b[0m\n", - "\u001b[32m2024-12-03 13:45:11.360\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxretrieval.core\u001b[0m:\u001b[36mrun_benchmark_bm25\u001b[0m:\u001b[36m80\u001b[0m - \u001b[1mTokenizing corpus\u001b[0m\n" + "\u001b[32m2024-12-03 14:19:16.198\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxretrieval.core\u001b[0m:\u001b[36mrun_benchmark_bm25\u001b[0m:\u001b[36m76\u001b[0m - \u001b[1mRunning BM25 retrieval benchmark\u001b[0m\n", + "\u001b[32m2024-12-03 14:19:16.199\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxretrieval.datasets.coco\u001b[0m:\u001b[36m__init__\u001b[0m:\u001b[36m26\u001b[0m - \u001b[1mCOCO validation dataset found in data/coco/val2017, skipping download\u001b[0m\n", + "\u001b[32m2024-12-03 14:19:17.082\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxretrieval.core\u001b[0m:\u001b[36mrun_benchmark_bm25\u001b[0m:\u001b[36m80\u001b[0m - \u001b[1mTokenizing corpus\u001b[0m\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "29b5e9fdf18a492490207b5ff52c0a81", + "model_id": "1735603033a2470dbd43ff1291945485", "version_major": 2, "version_minor": 0 }, @@ -129,7 +129,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "865425dc2e9741fa8dbaea14914f3b9d", + "model_id": "4f37b05ce1f64318bde369abe6e59cdf", "version_major": 2, "version_minor": 0 }, @@ -143,7 +143,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "39033e76d4b84270a31a3b8cbcceb454", + "model_id": "35e0240cf8034a7383b8b149ec20f586", "version_major": 2, "version_minor": 0 }, @@ -157,7 +157,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "92a8a1a8fe424a28b96a7c452958abea", + "model_id": "6f6537967a0d4acea0e7a19245d8382c", "version_major": 2, "version_minor": 0 }, @@ -172,13 +172,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-12-03 13:45:11.532\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxretrieval.core\u001b[0m:\u001b[36mrun_benchmark_bm25\u001b[0m:\u001b[36m89\u001b[0m - \u001b[1mPerforming retrieval\u001b[0m\n" + "\u001b[32m2024-12-03 14:19:17.273\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxretrieval.core\u001b[0m:\u001b[36mrun_benchmark_bm25\u001b[0m:\u001b[36m89\u001b[0m - \u001b[1mPerforming retrieval\u001b[0m\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1fbc29e3bd9a4612abeed9c0a4917c3c", + "model_id": "c9dc899ac4e142ba836069b35fce2056", "version_major": 2, "version_minor": 0 }, @@ -192,7 +192,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ae54bc2681c94a23a37d633b280b9609", + "model_id": "d3aa36358e0d4d56859d27eef5a5f087", "version_major": 2, "version_minor": 0 }, @@ -206,7 +206,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3a1c113851f546219537b2e65bce70f1", + "model_id": "851a3d64913d4008ab5ffc1758c81f58", "version_major": 2, "version_minor": 0 }, @@ -221,7 +221,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2024-12-03 13:45:12.252\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxretrieval.core\u001b[0m:\u001b[36mrun_benchmark_bm25\u001b[0m:\u001b[36m92\u001b[0m - \u001b[1mCalculating metrics\u001b[0m\n" + "\u001b[32m2024-12-03 14:19:18.052\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mxretrieval.core\u001b[0m:\u001b[36mrun_benchmark_bm25\u001b[0m:\u001b[36m92\u001b[0m - \u001b[1mCalculating metrics\u001b[0m\n" ] }, { @@ -264,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { diff --git a/pyproject.toml b/pyproject.toml index 3f6acbc..22f35c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ dependencies = [ "sentence-transformers>=3.3.0", "timm>=1.0.0", "accelerate>=1.1.0", + "bm25s>=0.2.5", + "pystemmer>=2.2.0.3", ] [build-system]