forked from SquareAndCompass-2/R2R
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ingestion.py
72 lines (62 loc) · 2.12 KB
/
ingestion.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from abc import abstractmethod
from typing import Any, Iterator, Optional
from ..abstractions.document import BasicDocument
from ..adapters import Adapter
from ..providers.logging import LoggingDatabaseConnection
from ..utils import generate_run_id
from .pipeline import Pipeline
class IngestionPipeline(Pipeline):
    """
    Base class for pipelines that turn raw blobs into BasicDocument objects.

    Subclasses declare the entry types they handle through `supported_types`
    and implement `process_data` and `parse_entry`. `run` validates the
    incoming blobs and delegates each one to `parse_entry`.
    """

    def __init__(
        self,
        adapters: Optional[dict[Any, Adapter]] = None,
        logging_connection: Optional[LoggingDatabaseConnection] = None,
        *args,
        **kwargs,
    ):
        """
        Initialize the underlying Pipeline.

        Only `logging_connection` and `**kwargs` are forwarded to the parent
        initializer; `adapters` and `*args` are accepted but not used here.
        """
        # NOTE(review): `adapters` is not stored by this base class --
        # presumably subclasses keep their own adapter registry; confirm.
        super().__init__(logging_connection=logging_connection, **kwargs)

    def initialize_pipeline(self, *args, **kwargs) -> None:
        """
        Reset `pipeline_run_info` with a fresh run id and the "ingestion" type.
        """
        self.pipeline_run_info = {
            "run_id": generate_run_id(),
            "type": "ingestion",
        }

    @property
    @abstractmethod
    def supported_types(self) -> list[str]:
        """
        Returns a list of supported data types.
        """
        pass

    @abstractmethod
    def process_data(
        self, entry_type: str, entry_data: Any
    ) -> Iterator[BasicDocument]:
        """
        Process data into plaintext based on the data type and yield BasicDocument objects.

        This hook is not called by `run` in this class.
        """
        pass

    @abstractmethod
    def parse_entry(
        self, entry_type: str, entry_data: Any
    ) -> Iterator[BasicDocument]:
        """
        Parse entry data into plaintext based on the entry type and yield BasicDocument objects.
        """
        pass

    def run(
        self,
        document_id: str,
        blobs: dict[str, Any],
        metadata: Optional[dict] = None,
        **kwargs,
    ) -> Iterator[BasicDocument]:
        """
        Parse every blob with `parse_entry` and yield the resulting BasicDocument objects.

        This is a generator: nothing below executes (including the
        validation) until the caller starts iterating.

        Args:
            document_id: Accepted for interface compatibility; not used here.
            blobs: Mapping of entry type to the raw data for that entry.
            metadata: Accepted for interface compatibility; not used here.
            **kwargs: Accepted and ignored.

        Yields:
            Whatever `parse_entry` yields for each blob, in mapping order.

        Raises:
            ValueError: If `blobs` is empty, or if any of its keys is not in
                `supported_types`.
        """
        self.initialize_pipeline()

        if not blobs:
            raise ValueError("No blobs provided to process.")

        # Validate every entry type before parsing anything, so an unsupported
        # type cannot abort the run after earlier blobs were already yielded.
        supported = self.supported_types
        for entry_type in blobs:
            if entry_type not in supported:
                raise ValueError(f"IngestionType {entry_type} not supported.")

        for entry_type, blob in blobs.items():
            yield from self.parse_entry(entry_type, blob)