Initial commit
lexiforest committed Apr 28, 2024
0 parents commit 1f3cc4e
Showing 28 changed files with 499 additions and 0 deletions.
Empty file added .github/workflows/build.yaml
Empty file.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
dist/
ragbear.egg-info/
**/__pycache__
data/
.DS_Store
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Oyster Oil

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
75 changes: 75 additions & 0 deletions README.md
@@ -0,0 +1,75 @@
# 🐻 Ragbear

## Why

RAG is simple, as simple as the following 10 lines of code (an illustrative sketch; `embed`, `search`, and `llm` stand in for your embedding model, vector index, and LLM client):

```py
def rag(question):
    # 1. embed the question
    vec = embed(question)
    # 2. retrieve the most relevant documents
    docs = search(vec, k=5)
    # 3. ask the LLM to answer from the references
    refs = "\n".join(doc.text for doc in docs)
    prompt = f"Question: {question}\nReferences:\n{refs}"
    return llm(prompt)
```

With `ragbear`, it's even simpler:

```py
ragbear.query("Who is the father of Luke Skywalker?")
```

But a lot fancier, too:

```py
ragbear.query(
    "Who is the father of Luke Skywalker?",
    model="http://localhost:8964",
    source=[
        {"data": "http://localhost:8888", "type": "meilisearch"},  # query a meilisearch database
    ],
    rerank="newest",
    rewrite="query2doc",
    template=template,
)
```

But to optimize towards your academic or commercial goals, you need a framework.
However, [existing solutions are too heavy](link to reddit).

Enter Ragbear. We follow the pattern above closely, but give you the option to swap each part
of the pipeline. The code is concise and straightforward, with no useless wrappers around wrappers.
Just read it.

Unlike LangChain, we do not try to encapsulate every solution out there, because there is
actually not that much to wrap. We would rather call ragbear a pattern than a framework.

## Implemented algorithms

With ragbear, we want to incorporate the latest research ideas and put them into production.
We have implemented the following algorithms, which you can easily apply in your app:

- HyDE
- REPLUG
- Query2Doc

For example, to use the REPLUG method, it's as easy as:

```py
import ragbear

ans = ragbear.query("Where was Steve Jobs born?", rewrite="replug")
```
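REPLUG proper works on next-token distributions; a greatly simplified sketch of its core idea, weighting each retrieved document's answer distribution by its retrieval score, might look like this (all names and stub values are illustrative):

```py
import math


def replug_ensemble(scored_docs, answer_probs):
    """Weight each document's answer distribution by its softmax-normalized
    retrieval score, then sum the weighted distributions."""
    weights = [math.exp(score) for _, score in scored_docs]
    total = sum(weights)
    combined = {}
    for (doc, _), w in zip(scored_docs, weights):
        for answer, p in answer_probs(doc).items():
            combined[answer] = combined.get(answer, 0.0) + (w / total) * p
    return max(combined, key=combined.get)


# Stub retrieval scores and per-document answer distributions:
best = replug_ensemble(
    [("doc1", 2.0), ("doc2", 0.0)],
    lambda d: {"San Francisco": 0.9, "Seattle": 0.1}
    if d == "doc1"
    else {"Seattle": 0.6, "San Francisco": 0.4},
)
```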

## Data

We support the following data query backends:

- DuckDuckGo, via the Python package `duckduckgo_search`
- Local datasets, via ANN engines (a.k.a. vector databases):
  - hnswlib + sqlite
  - faiss + sqlite
  - pg_vector
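The "ANN index + sqlite" combination might look roughly like this; to keep the sketch dependency-free, a brute-force cosine scan stands in for the real hnswlib/faiss index, and all names are illustrative:

```py
import math
import sqlite3


def cosine(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(x * x for x in b))
    return dot / (na * nb)


class LocalVectorStore:
    """Vectors in memory (stand-in for an ANN index), texts in sqlite."""

    def __init__(self):
        self.vectors = {}  # doc_id -> embedding
        self.db = sqlite3.connect(":memory:")
        self.db.execute("CREATE TABLE docs (id INTEGER PRIMARY KEY, text TEXT)")

    def add(self, doc_id, vector, text):
        self.vectors[doc_id] = vector
        self.db.execute("INSERT INTO docs VALUES (?, ?)", (doc_id, text))

    def search(self, query_vec, k=3):
        # Rank ids by cosine similarity, then fetch the texts from sqlite.
        ranked = sorted(
            self.vectors,
            key=lambda i: cosine(self.vectors[i], query_vec),
            reverse=True,
        )
        return [
            self.db.execute("SELECT text FROM docs WHERE id = ?", (i,)).fetchone()[0]
            for i in ranked[:k]
        ]


store = LocalVectorStore()
store.add(1, [1.0, 0.0], "Steve Jobs co-founded Apple.")
store.add(2, [0.0, 1.0], "Luke Skywalker is a Jedi.")
hits = store.search([0.9, 0.1], k=1)
```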

## Dependencies

- An LLM API endpoint; you can use the OpenAI API or local models with Ollama.
- A data backend; use `duckduckgo_search` or a local dataset.
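Any OpenAI-compatible endpoint is called the same way. A minimal chat-completion call might look like this sketch; the base URL (Ollama's default port) and model name are placeholders, not ragbear defaults:

```py
import json
import urllib.request


def build_payload(prompt: str, model: str = "llama3") -> dict:
    """Request body in the OpenAI chat-completions shape, which Ollama
    and most self-hosted servers also accept."""
    return {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
    }


def chat(prompt: str, base_url: str = "http://localhost:11434") -> str:
    req = urllib.request.Request(
        f"{base_url}/v1/chat/completions",
        data=json.dumps(build_payload(prompt)).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        body = json.load(resp)
    return body["choices"][0]["message"]["content"]
```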


7 changes: 7 additions & 0 deletions config.yaml
@@ -0,0 +1,7 @@
model: gpt-3
embedding_model: gte
index_engine: hnswlib
query_rewrite:
- expand
- reform

Empty file added examples/api.py
Empty file.
59 changes: 59 additions & 0 deletions examples/simple.py
@@ -0,0 +1,59 @@
import ragbear


ragbear.query("Did Steve Jobs receive a PhD degree?")
# -> No. He did not receive a PhD degree.


# Use duckduckgo as the data source
prompt = "Did Steve Jobs receive a PhD degree?"
ragbear.query(prompt, source="duckduckgo")


# Use another model; you can use any OpenAI-compatible API endpoint
ragbear.query(prompt, model="http://localhost:8964")
ragbear.query(prompt, model="http://localhost:8964", openai_token="xxxxxx")


# Use a local data source; you need to ingest the data first (see ingest.md for details)
ragbear.query(
    prompt,
    source=[
        {"data": "/tmp/data/wiki.hnsw", "type": "hnsw", "max_results": 10},  # query local hnsw indexed files
        {"data": "/tmp/data/wiki.bm25", "type": "bm25", "max_results": 10},  # query local bm25 indexed files
        {"data": "http://localhost:8888", "type": "meilisearch", "max_results": 10},  # query a meilisearch database
    ],
)


# Rerank the documents before generation
ragbear.query(prompt, rerank="similarity")
ragbear.query(prompt, rerank="oldest")
ragbear.query(prompt, rerank="newest")


# Rewrite the prompt before querying
ragbear.query(prompt, rewrite="query2doc")


# Use a different template for generation
template = """
You are a helpful assistant, answer the following question with the references:
Question: {question}
References: {refs}
"""
ragbear.query(prompt, template=template)


# Putting it all together
ragbear.query(
    "Who is the father of Luke Skywalker?",
    model="http://localhost:8964",
    source=[
        {"data": "http://localhost:8888", "type": "meilisearch", "max_results": 10},  # query a meilisearch database
    ],
    rerank="newest",
    rewrite="query2doc",
    template=template,
)
5 changes: 5 additions & 0 deletions examples/with_config.py
@@ -0,0 +1,5 @@
import ragbear

bear = ragbear.from_config("config.yaml")
ans = bear.query("Where was Steve Jobs born?", strategy="replug")
print(ans)
71 changes: 71 additions & 0 deletions pyproject.toml
@@ -0,0 +1,71 @@
[project]
name = "ragbear"
version = "0.0.1a1"
authors = [{ name = "Oyster Oil", email = "infinitesheldon@gmail.com" }]
description = "Comprehensive RAG implementations"
license = { file = "LICENSE" }
dependencies = [
    "fastapi",
    "torch",
    "loguru",
    "pydantic",
    "pyyaml",
    "readability-lxml",
    "markdownify",
    "duckduckgo-search",
    "sentence-transformers",
]
readme = "README.md"
requires-python = ">=3.8"
urls = { "repository" = "https://github.com/oysteroil/ragbear" }
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]


[project.optional-dependencies]
dev = [ ]
build = [ ]
test = [
"pytest>=8.1.1,<9.0",
]


[build-system]
requires = ["wheel", "setuptools"]
build-backend = "setuptools.build_meta"


[tool.setuptools]
packages = ["ragbear"]
package-data = {}


[tool.pytest.ini_options]
# pythonpath = [ "." ]
asyncio_mode = "auto"


[tool.ruff]
line-length = 88

[tool.ruff.lint]
select = [
"E", # pycodestyle
"F", # Pyflakes
"UP", # pyupgrade
"B", # flake8-bugbear
"SIM", # flake8-simplify
"I", # isort
]

[tool.mypy]
python_version = "3.8"
ignore_missing_imports = true
exclude = ["benchmark/", "docs/", "examples/", "scripts/", "tests/", "build/"]
4 changes: 4 additions & 0 deletions ragbear/__init__.py
@@ -0,0 +1,4 @@
__all__ = ["query"]


from ._entry import query
8 changes: 8 additions & 0 deletions ragbear/__main__.py
@@ -0,0 +1,8 @@
import argparse

parser = argparse.ArgumentParser()
subparser = parser.add_subparsers(dest="action")

ingest = subparser.add_parser("ingest")
ingest.add_argument("--data")

args = parser.parse_args()
45 changes: 45 additions & 0 deletions ragbear/_entry.py
@@ -0,0 +1,45 @@
from contextlib import suppress
from typing import List, Literal, Optional, TypedDict, Union

with suppress(ImportError):
    pass  # placeholder for optional dependency imports


class SourceVendorType(TypedDict):
    data: str
    type: str
    max_results: Optional[int]


SourceType = Union[Literal["duckduckgo"], List[SourceVendorType]]
RerankType = Literal["similarity", "oldest", "newest"]
RewriteType = Literal["query2doc"]


def query(
    prompt: str,
    model: Optional[str] = None,
    *,
    openai_token: Optional[str] = None,
    source: Optional[SourceType] = None,
    rerank: Optional[RerankType] = None,
    rewrite: Optional[RewriteType] = None,
    template: Optional[str] = None,
    embedding_model: Optional[str] = None,
):
    """Query the LLM, and return the summarized result.

    Args:
        prompt: the query string.
        model: the model to use; by default, use a local ollama server.
        source: the data source for retrieval.
    """

    # 1. preprocess the query

    # 2. find relevant docs

    # 3. query the LLM

    return
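The three commented steps might eventually compose along these lines; this is a dependency-free sketch with stub callables, not the committed implementation:

```py
def run_pipeline(prompt, rewrite, retrieve, generate, template):
    # 1. preprocess (rewrite) the query
    query = rewrite(prompt)

    # 2. find relevant docs
    docs = retrieve(query)

    # 3. query the LLM with the filled-in template
    refs = "\n".join(docs)
    return generate(template.format(question=prompt, refs=refs))


answer = run_pipeline(
    "Where was Steve Jobs born?",
    rewrite=lambda q: q,
    retrieve=lambda q: ["Steve Jobs was born in San Francisco."],
    generate=lambda p: p.splitlines()[-1],
    template="Question: {question}\nReferences: {refs}",
)
```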
5 changes: 5 additions & 0 deletions ragbear/answer.py
@@ -0,0 +1,5 @@
from pydantic import BaseModel


class Answer(BaseModel):
    text: str
Empty file added ragbear/api/__init__.py
Empty file.
Empty file added ragbear/api/app.py
Empty file.
19 changes: 19 additions & 0 deletions ragbear/config.py
@@ -0,0 +1,19 @@
from pathlib import Path
from typing import List, Literal, Union

import yaml
from pydantic import BaseModel


class Config(BaseModel):
    model: str
    index_engine: Literal["hnswlib", "faiss", "annoy"]
    embedding_model: str
    query_rewrite: List[Literal["expand", "reform"]]


def read_config(p: Union[str, Path]) -> Config:
    # Parse the YAML file and validate it against the Config schema.
    # Union[...] instead of `str | Path` keeps Python 3.8 compatibility.
    with open(p) as f:
        return Config(**yaml.safe_load(f))


Empty file added ragbear/dababase/__init__.py
Empty file.
42 changes: 42 additions & 0 deletions ragbear/dababase/find.py
@@ -0,0 +1,42 @@
from typing import List, Optional

from duckduckgo_search import DDGS
from pydantic import AnyUrl, BaseModel
from sentence_transformers import SentenceTransformer

from ragbear.rank import noop_rank


model = SentenceTransformer("cyclone/simcse-chinese-roberta-wwm-ext")


class Doc(BaseModel):
    title: str = ""
    text: str = ""
    full_text: str = ""
    url: Optional[AnyUrl] = None
    source: str = ""
    source_type: str = ""
    time_used: int = 0


def find(query: str, index, recall="bm25") -> List[Doc]:
    # 1. vectorize the query string
    vec = model.encode(query)

    # 2. find docs in the index
    docs = index.search(vec)

    return docs


def find_bm25(query: str):
    ...


def find_duckduckgo(query: str, max_results: int = 10) -> List[Doc]:
    # DDGS().text returns dicts with "title", "href", and "body" keys.
    results = DDGS().text(query, max_results=max_results)
    return [
        Doc(
            title=r.get("title", ""),
            text=r.get("body", ""),
            url=r.get("href"),
            source="duckduckgo",
            source_type="web",
        )
        for r in results
    ]
