Skip to content

Commit

Permalink
Remove duckdb and cli index selection
Browse files (browse the repository at this point in the history)
  • Loading branch information
jbothma committed Nov 22, 2024
1 parent 47826b5 commit 5d69c23
Show file tree
Hide file tree
Showing 7 changed files with 10 additions and 443 deletions.
6 changes: 2 additions & 4 deletions nomenklatura/cli.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,7 +10,7 @@
from followthemoney.cli.aggregate import sorted_aggregate

from nomenklatura.cache import Cache
from nomenklatura.index import MEMORY_INDEX_PATH
from nomenklatura.index import Index
from nomenklatura.matching import train_v2_matcher, train_v1_matcher
from nomenklatura.store import load_entity_file_store
from nomenklatura.resolver import Resolver
Expand Down Expand Up @@ -64,7 +64,6 @@ def cli() -> None:
@click.option("-l", "--limit", type=click.INT, default=5000)
@click.option("--algorithm", default=DefaultAlgorithm.NAME)
@click.option("--scored/--unscored", is_flag=True, type=click.BOOL, default=True)
@click.option("-i", "--index", type=click.STRING)
@click.option(
"-c",
"--clear",
Expand All @@ -79,7 +78,7 @@ def xref_file(
algorithm: str = DefaultAlgorithm.NAME,
limit: int = 5000,
scored: bool = True,
index: str = MEMORY_INDEX_PATH,
index: str = Index.name,
clear: bool = False,
) -> None:
resolver_ = _get_resolver(path, resolver)
Expand All @@ -103,7 +102,6 @@ def xref_file(
algorithm=algorithm_type,
scored=scored,
limit=limit,
index_path=index,
)
resolver_.save()
log.info("Xref complete in: %s", resolver_.path)
Expand Down
37 changes: 1 addition & 36 deletions nomenklatura/index/__init__.py
Original file line number | Diff line number | Diff line change
@@ -1,40 +1,5 @@
from importlib import import_module
import logging
from pathlib import Path
from typing import Type, Optional

from nomenklatura.index.index import Index
from nomenklatura.index.common import BaseIndex
from nomenklatura.store import View
from nomenklatura.dataset import DS
from nomenklatura.entity import CE

log = logging.getLogger(__name__)


MEMORY_INDEX_PATH = "nomenklatura.index.index.Index"


def get_index(
view: View[DS, CE], path: Path, class_path: Optional[str]
) -> BaseIndex[DS, CE]:
"""Get the best available index class to use."""
clazz: Type[BaseIndex[DS, CE]] = Index[DS, CE]
if class_path is not None:
try:
module_path, class_name = class_path.rsplit(".", 1)
module = import_module(module_path)
clazz_ref = getattr(module, class_name)

clazz = clazz_ref[DS, CE]
except ImportError:
log.warning(
"f`{class_path}` is not available, falling back to in-memory index."
)

index = clazz(view, path)
index.build()
return index


__all__ = ["BaseIndex", "Index", "MEMORY_INDEX_PATH", "get_index"]
__all__ = ["BaseIndex", "Index"]
240 changes: 0 additions & 240 deletions nomenklatura/index/duckdb_index.py

This file was deleted.

8 changes: 5 additions & 3 deletions nomenklatura/xref.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,12 +3,13 @@
from followthemoney.schema import Schema
from pathlib import Path

from nomenklatura import Index
from nomenklatura.dataset import DS
from nomenklatura.entity import CE
from nomenklatura.store import Store
from nomenklatura.judgement import Judgement
from nomenklatura.resolver import Resolver
from nomenklatura.index import get_index
from nomenklatura.index import BaseIndex
from nomenklatura.matching import DefaultAlgorithm, ScoringAlgorithm
from nomenklatura.conflicting_match import ConflictingMatchReporter

Expand All @@ -31,6 +32,7 @@ def xref(
resolver: Resolver[CE],
store: Store[DS, CE],
index_dir: Path,
index_type: Type[BaseIndex[DS, CE]] = Index,
limit: int = 5000,
limit_factor: int = 10,
scored: bool = True,
Expand All @@ -41,12 +43,12 @@ def xref(
conflicting_match_threshold: Optional[float] = None,
focus_dataset: Optional[str] = None,
algorithm: Type[ScoringAlgorithm] = DefaultAlgorithm,
index_path: Optional[str] = None,
user: Optional[str] = None,
) -> None:
log.info("Begin xref: %r, resolver: %s", store, resolver)
view = store.default_view(external=external)
index = get_index(view, index_dir, index_path)
index = index_type(view, index_dir)
index.build()
conflict_reporter = None
if conflicting_match_threshold is not None:
conflict_reporter = ConflictingMatchReporter(
Expand Down
8 changes: 0 additions & 8 deletions tests/conftest.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,7 +6,6 @@
from tempfile import mkdtemp

from nomenklatura import settings
from nomenklatura.index.duckdb_index import DuckDBIndex
from nomenklatura.index.tantivy_index import TantivyIndex
from nomenklatura.store import load_entity_file_store, SimpleMemoryStore
from nomenklatura.kv import get_redis
Expand Down Expand Up @@ -82,13 +81,6 @@ def tantivy_index(index_path: Path, dstore: SimpleMemoryStore):
yield index


@pytest.fixture(scope="function")
def duckdb_index(index_path: Path, dstore: SimpleMemoryStore):
index = DuckDBIndex(dstore.default_view(), index_path)
index.build()
yield index


@pytest.fixture(scope="function")
def index_path():
index_path = Path(mkdtemp()) / "index-dir"
Expand Down
Loading

0 comments on commit 5d69c23

Please sign in to comment.