-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from alexandrainst/domsdatabasen
Domsdatabasen API
- Loading branch information
Showing
9 changed files
with
2,834 additions
and
35 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -113,3 +113,6 @@ models/* | |
|
||
# Dotenv file with name and email | ||
.name_and_email | ||
|
||
|
||
znotes.md |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
"""Configuration for the domsdatabasen package.""" | ||
|
||
from omegaconf import DictConfig, OmegaConf | ||
|
||
# Each top-level section of the configuration is assembled separately and then
# combined, in the original key order, into the single ``config`` object.

# Settings under the "scrape" key.
_SCRAPE_SETTINGS = {
    "paths": {"download_dir": "download_tmp/", "test_dir": "tmp/"},
    "force": False,
    "case_id": "1",
    "all": False,
    "start_case_id": "3962",
    "messages": {
        "give_correct_input": (
            "Please specify either a 'case_id' or use 'all' "
            "to scrape all cases.\n"
        ),
        "done": "Scraping done!\n",
    },
    "test_case_name": "test_case",
    "test_case_id": "1",
    "sleep": 5,
    "max_consecutive_nonexistent_page_count": 100,
    "timeout_pdf_download": 10,
}

# Settings under the "process" key.
_PROCESS_SETTINGS = {
    "paths": {
        "test_data_raw_dir": "tests/data/processor/raw/",
        "test_data_processed_dir": "tests/data/processor/processed",
        "blacklist": "data/blacklists/process.jsonl",
    },
    "force": False,
    "case_id": "1",
    "all": False,
    "start_case_id": "2732",
    "blacklist_flag": False,
    "test_case_id": "1",
    "page_number": False,
    "gpu": False,
    "max_y_difference": 25,
    "neighbor_distance_max": 1,
    "box_area_min": 2500,
    "box_height_min": 35,
    "box_height_upper": 110,
    "box_width_min": 35,
    "box_accept_ratio": 0.6,
    "box_split_white_space": 7,
    "shift_up": 0,
    "iou_overlap_threshold": 0.5,
    "indices_to_split_edge_min_length": 5,
    "edge_accept_ratio": 0.95,
    "indices_to_split_row_diff": 45,
    "max_scale": 1.5,
    "anonymized_box_crop_padding": 3,
    "make_split_between_overlapping_box_and_line_height_max": 30,
    "box_split_delta": 2,
    "threshold_binarize_process_image": 50,
    "threshold_binarize_anonymized_boxes": 75,
    "threshold_binarize_empty_box": 100,
    "threshold_binarize_top_page": 230,
    "threshold_binarize_process_crop": 200,
    "threshold_gap": 11,
    "threshold_box_confidence": 0.3,
    "threshold_remove_boundary_height": 20,
    "threshold_remove_boundary_length": 50,
    "threshold_remove_boundary_closely_square": 3,
    "threshold_remove_boundary_too_few_pixels": 10,
    "threshold_footnote_height": 30,
    "invert_find_anonymized_boxes": False,
    "invert_find_underline_anonymizations": True,
    "underline_length_min": 26,
    "underline_height_lower_bound": 2,
    "underline_height_upper_bound": 7,
    "underline_remove_pad": 1,
    "underline_box_height": 50,
    "underline_box_height_min": 32,
    "underline_box_expand": 3,
    "threshold_binarize_line_anonymization": 255,
    "line_start_ignore_col": 1250,
    "line_start_ignore_row": 3000,
    "remove_cell_border": 5,
    "cell_box_shrink": 5,
    "cell_box_crop_padding": 3,
    "cell_multiple_lines_gap_threshold": 10,
    "remove_table_border": 7,
    "threshold_binarize_process_before_table_search": 1,
    "origin_box": "box",
    "origin_underline": "underline",
    "page_from_top_to_this_row": 500,
    "logo_bbox_area_threshold": 50000,
}

# Hub repository id and data directories shared across sections.
_SHARED_PATHS = {
    "hf_hub": "alexandrainst/domsdatabasen",
    "data_raw_dir": "data/raw/",
    "data_processed_dir": "data/processed",
    "data_final_dir": "data/final",
}

# File names listed under the "file_names" key.
_FILE_NAMES = {
    "tabular_data": "tabular_data.json",
    "pdf_document": "document.pdf",
    "processed_data": "processed_data.json",
    "dataset": "dataset.jsonl",
}

config: DictConfig = OmegaConf.create(
    {
        "scrape": _SCRAPE_SETTINGS,
        "process": _PROCESS_SETTINGS,
        "finalize": {"force": False},
        "domsdatabasen": {"url": "https://domsdatabasen.dk/#sag"},
        "paths": _SHARED_PATHS,
        "file_names": _FILE_NAMES,
        "anon_method": {"underline": "underline", "box": "box", "none": "none"},
        "testing": False,
    }
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
"""API for accessing processed data from Domsdatabasen.""" | ||
|
||
from logging import getLogger | ||
from typing import Union | ||
|
||
from datasets import load_dataset | ||
from domsdatabasen import DatasetBuilder, Processor, Scraper | ||
from omegaconf import DictConfig | ||
|
||
from .config_domsdatabasen import config | ||
|
||
logger = getLogger(__name__)


class Domsdatabasen:
    """API for accessing processed data from Domsdatabasen.

    Attributes:
        config (DictConfig):
            Configuration settings object.
        scraper (Scraper or None):
            Scraper object for scraping data from Domsdatabasen.
            ``None`` until the first time it is needed.
        processor (Processor or None):
            Processor object for processing scraped data.
            ``None`` until the first time it is needed.
        dataset_builder (DatasetBuilder or None):
            DatasetBuilder object for building dataset samples.
            ``None`` until the first time it is needed.
        dataset (Dataset):
            Dataset of processed data from Domsdatabasen.
    """

    def __init__(self) -> None:
        """Load the configuration and the "train" split of the dataset."""
        self.config: DictConfig = config

        # Read the repository id from the configuration rather than repeating
        # the literal here, so the two cannot drift apart.
        self.dataset = load_dataset(self.config.paths.hf_hub, split="train")

        # The following objects will not be initialized until
        # the first time they are needed.
        self.scraper = None
        self.processor = None
        self.dataset_builder = None

    def get_case(self, case_id: Union[str, int]) -> dict:
        """Get processed data for a case from Domsdatabasen.

        If the case_id is already in the dataset, the data is returned from
        the dataset. Else, the case will be scraped and processed.

        Args:
            case_id (str, int):
                The case_id of the case to get data for.

        Returns:
            dataset_sample (dict):
                Processed data for the case.
        """
        # Samples are compared against a string id below, so normalise once
        # up front. ``str`` is a no-op for ids that are already strings and
        # also covers integer-like ids that are not the builtin ``int``.
        case_id = str(case_id)

        # Check if case_id is already in dataset
        cached_sample = next(
            (sample for sample in self.dataset if sample["case_id"] == case_id),
            None,
        )
        if cached_sample is not None:
            logger.info("Found case_id %s in cached dataset.", case_id)
            return cached_sample

        # If case_id is not in dataset, scrape and process the case
        logger.info(
            "Case_id %s not found in cached dataset. "
            "Scraping and processing the case...",
            case_id,
        )
        self._initialize_objects()
        self.scraper.scrape(case_id=case_id)
        processed_data = self.processor.process(case_id=case_id)
        return self.dataset_builder.make_dataset_sample(
            processed_data=processed_data
        )

    def _initialize_objects(self) -> None:
        """Initialize Scraper, Processor and DatasetBuilder objects.

        We don't want to initialize these objects before they are needed.
        Each object is checked on its own, so an instance on which only some
        of them have been set still ends up with all three available.
        """
        if self.scraper is None:
            self.scraper = Scraper(config=self.config)
        if self.processor is None:
            self.processor = Processor(config=self.config)
        if self.dataset_builder is None:
            self.dataset_builder = DatasetBuilder(config=self.config)