Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integra base para segmentadores e segmentador da AMA #64

Merged
merged 5 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion main/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
from storage import create_storage_interface
from index import create_index_interface
from tasks import (
create_gazettes_index,
create_themed_excerpts_index,
embedding_rerank_excerpts,
extract_text_from_gazettes,
extract_themed_excerpts_from_gazettes,
get_gazettes_to_be_processed,
get_themes,
get_territories,
tag_entities_in_excerpts,
)

Expand Down Expand Up @@ -42,11 +45,15 @@ def execute_pipeline():
text_extractor = create_apache_tika_text_extraction()
themes = get_themes()

create_gazettes_index(index)
territories = get_territories(database)
gazettes_to_be_processed = get_gazettes_to_be_processed(execution_mode, database)
indexed_gazette_ids = extract_text_from_gazettes(
gazettes_to_be_processed, database, storage, index, text_extractor
gazettes_to_be_processed, territories, database, storage, index, text_extractor
)

for theme in themes:
create_themed_excerpts_index(theme, index)
themed_excerpt_ids = extract_themed_excerpts_from_gazettes(
theme, indexed_gazette_ids, index
)
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ requests==2.25.0
scikit-learn==1.0.2
sentence-transformers==2.2.0
huggingface-hub==0.10.1 # fix: https://github.com/UKPLab/sentence-transformers/issues/1762
python-slugify[unidecode]==8.0.1
5 changes: 5 additions & 0 deletions segmentation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .factory import get_segmenter

__all__ = [
"get_segmenter",
]
7 changes: 7 additions & 0 deletions segmentation/base/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from .gazette_segment import GazetteSegment
from .association_segmenter import AssociationSegmenter

__all__ = [
"GazetteSegment",
"AssociationSegmenter",
]
27 changes: 27 additions & 0 deletions segmentation/base/association_segmenter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from typing import Any, Dict, Iterable, List, Union
from segmentation.base import GazetteSegment


class AssociationSegmenter:
    """Base class for segmenters that split an association gazette by territory.

    Concrete subclasses (one per association) implement the three hooks
    below; this base only stores the territory metadata they need.
    """

    def __init__(self, territories: Iterable[Dict[str, Any]]):
        # Territory metadata records used by subclasses to resolve segments.
        self.territories = territories

    def get_gazette_segments(self, *args, **kwargs) -> List[Union["GazetteSegment", Dict]]:
        """Return a list of GazetteSegment (or segment dicts)."""
        raise NotImplementedError

    def split_text_by_territory(self, *args, **kwargs) -> Union[Dict[str, str], List[str]]:
        """Split an association gazette text into per-territory text segments."""
        raise NotImplementedError

    def build_segment(self, *args, **kwargs) -> "GazetteSegment":
        """Build and return a single GazetteSegment."""
        raise NotImplementedError

27 changes: 27 additions & 0 deletions segmentation/base/gazette_segment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from datetime import date, datetime
from dataclasses import dataclass


@dataclass
class GazetteSegment:
    """
    Represents the segment of an association gazette that belongs to a
    single city (territory).

    Most fields mirror the source gazette record; segment-specific values
    (checksum, text, territory name/id, processed flag) are overridden when
    the segment is built from the association text.
    """
    id: str
    territory_name: str
    source_text: str  # text of this segment only, not the whole gazette
    date: date
    edition_number: str
    is_extra_edition: bool
    power: str
    file_checksum: str  # checksum computed over the segment text
    scraped_at: datetime
    created_at: datetime
    processed: bool
    file_path: str
    file_url: str
    state_code: str
    territory_id: str
    file_raw_txt: str
    url: str
49 changes: 49 additions & 0 deletions segmentation/factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from typing import Any, Dict, Iterable

from segmentation.base import AssociationSegmenter
from segmentation import segmenters


# Cache of already-built segmenters, keyed by territory id (one per territory).
_segmenter_instances = {}


def get_segmenter(territory_id: str, territories: Iterable[Dict[str, Any]]) -> "AssociationSegmenter":
    """
    Factory method to return an AssociationSegmenter.

    Instances are cached per territory id, so repeated calls with the same
    id return the same segmenter object.

    Parameters
    ----------
    territory_id : str
        Identifier of the association territory (e.g. "2700000").
    territories : Iterable[Dict[str, Any]]
        Territory metadata passed through to the segmenter constructor.

    Raises
    ------
    KeyError
        If no segmenter class is registered for ``territory_id``.

    Example
    -------
    >>> territory_id = "9999999"
    >>> territories = [
        {
            "id": "9999999",
            "territory_name": "Bairro do Limoeiro",
            "state_code": "ZZ",
            "state": "Limoeirolândia",
        }, {
            "id": "0000000",
            "territory_name": "Castelo Rá-Tim-Bum",
            "state_code": "SP",
            "state": "São Paulo",
        },
    ]
    >>> from segmentation import get_segmenter
    >>> segmenter = get_segmenter(territory_id, territories)
    >>> segments = segmenter.get_gazette_segments()

    Notes
    -----
    This method implements a factory method pattern.
    See: https://github.com/faif/python-patterns/blob/master/patterns/creational/factory.py
    """

    territory_to_segmenter_class = {
        "2700000": "ALAssociacaoMunicipiosSegmenter",
    }

    if territory_id not in _segmenter_instances:
        if territory_id not in territory_to_segmenter_class:
            # Fail with a descriptive message instead of a bare dict KeyError.
            raise KeyError(
                f"No segmenter class registered for territory id {territory_id!r}"
            )
        segmenter_class_name = territory_to_segmenter_class[territory_id]
        segmenter_class = getattr(segmenters, segmenter_class_name)
        _segmenter_instances[territory_id] = segmenter_class(territories)

    return _segmenter_instances[territory_id]
5 changes: 5 additions & 0 deletions segmentation/segmenters/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from .al_associacao_municipios import ALAssociacaoMunicipiosSegmenter

__all__ = [
"ALAssociacaoMunicipiosSegmenter",
]
88 changes: 88 additions & 0 deletions segmentation/segmenters/al_associacao_municipios.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import re
import logging

from typing import Any, Dict, List
from segmentation.base import AssociationSegmenter, GazetteSegment
from tasks.utils import batched, get_checksum, get_territory_data, get_territory_slug


class ALAssociacaoMunicipiosSegmenter(AssociationSegmenter):
    """Segmenter for gazettes published by the AL municipalities association.

    Splits one association gazette text into per-municipality segments by
    matching each municipality's publication header with RE_NOMES_MUNICIPIOS.
    """

    # Matches a municipality publication header: (start marker)(municipality
    # name, possibly spanning two lines)(end marker). The Portuguese comments
    # inside the pattern document known edge cases and are part of the
    # original VERBOSE pattern literal.
    RE_NOMES_MUNICIPIOS = re.compile(
        r"""
        (ESTADO\sDE\sALAGOAS(?:|\s)\n{1,2}PREFEITURA\sMUNICIPAL\sDE\s) # Marcador de início do cabeçalho de publicação do município
        ((?!EDUCAÇÃO).*?\n{0,2}(?!VAMOS).*?$) # Nome do município (pode estar presente em até duas linhas). Exceções Notáveis: VAMOS, Poço das Trincheiras, 06/01/2022, ato CCB3A6AB; EDUCAÇÃO, Dois Riachos, 07/12/2023, ato ABCCE576
        (\n\s(?:\s|SECRETARIA|Secretaria)) # Marcador de fim do cabeçalho (pula mais de duas linhas). Exceções Notáveis: SECRETARIA, Coité do Nóia, 02/10/2018, ato 12F7DE15; Secretaria, Qubrângulo, 18/07/2023, atos 27FB2D83 a 1FAF9421
        """,
        re.MULTILINE | re.VERBOSE,
    )

    def get_gazette_segments(self, gazette: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Returns a list of dicts with the gazette segments' metadata,
        one per territory found in the gazette's source text.
        """
        territory_to_text_map = self.split_text_by_territory(gazette["source_text"])
        gazette_segments = [
            self.build_segment(territory_slug, segment_text, gazette).__dict__
            for territory_slug, segment_text in territory_to_text_map.items()
        ]
        return gazette_segments

    def split_text_by_territory(self, text: str) -> Dict[str, str]:
        """
        Segment an association text by territory and return a dict mapping
        territory slug to that territory's accumulated text.
        """
        # The first non-blank line is the association-wide header; it is
        # stripped from the body and re-added once per territory segment.
        ama_header = text.lstrip().split("\n", maxsplit=1)[0].rstrip()
        # clean headers
        clean_text = "\n".join(re.split(re.escape(ama_header), text))
        # clean final lines (drop everything after the last identifier code)
        clean_text = "\n".join(
            re.split(r"(Código Ide ?ntificador:\s*\w+)", clean_text)[:-1]
        )

        # re.split with 3 capture groups yields batches of 4 entries per
        # match: (start marker, name, end marker, following body text).
        raw_segments = re.split(self.RE_NOMES_MUNICIPIOS, clean_text)[1:]

        territory_to_text_map = {}
        for pattern_batch in batched(raw_segments, 4):
            territory_name = pattern_batch[1]
            clean_territory_name = self._normalize_territory_name(territory_name)
            territory_slug = get_territory_slug(clean_territory_name, "AL")
            # A municipality can appear more than once; segments accumulate
            # under its slug, seeded with the association header.
            previous_text_or_header = territory_to_text_map.setdefault(
                territory_slug, f"{ama_header}\n"
            )
            raw_batch_text = "".join(pattern_batch)
            new_territory_text = f"{previous_text_or_header}\n{raw_batch_text}"
            territory_to_text_map[territory_slug] = new_territory_text

        return territory_to_text_map

    def build_segment(
        self, territory_slug: str, segment_text: str, gazette: Dict
    ) -> GazetteSegment:
        """Build a GazetteSegment from a territory's text, inheriting the
        remaining metadata from the source gazette record."""
        logging.debug(
            f"Creating segment for territory \"{territory_slug}\" from {gazette['file_path']} file."
        )
        territory_data = get_territory_data(territory_slug, self.territories)

        return GazetteSegment(**{
            **gazette,
            # segment specific values
            "processed": True,
            "file_checksum": get_checksum(segment_text),
            "source_text": segment_text.strip(),
            "territory_name": territory_data["territory_name"],
            "territory_id": territory_data["id"],
        })

    def _normalize_territory_name(self, territory_name: str) -> str:
        """Clean a raw municipality name captured by the header regex."""
        clean_name = territory_name.strip().replace("\n", "")
        # Some municipality names carry a trailing "/AL" (e.g. Viçosa in the
        # 2022-01-17 gazette, act 8496EC0A). To avoid slugs like
        # "vicosa-/al-secretaria-municipal...", strip it along with other
        # trailing boilerplate the regex may have captured.
        # NOTE: pattern is now a raw string so "\s" is an explicit regex
        # escape instead of an invalid Python string escape.
        clean_name = re.sub(
            r"\s*(\/AL.*|GABINETE DO PREFEITO.*|PODER.*|http.*|PORTARIA.*|Extrato.*|ATA DE.*|SECRETARIA.*|Fundo.*|SETOR.*|ERRATA.*|- AL.*|GABINETE.*|EXTRATO.*|SÚMULA.*|RATIFICAÇÃO.*)",
            "",
            clean_name,
        )
        # Known misspelling in the source gazettes.
        name_to_fixed = {
            "MAJOR IZIDORO": "MAJOR ISIDORO",
        }
        return name_to_fixed.get(clean_name, clean_name)
2 changes: 2 additions & 0 deletions tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .create_index import create_gazettes_index, create_themed_excerpts_index
from .gazette_excerpts_embedding_reranking import embedding_rerank_excerpts
from .gazette_excerpts_entities_tagging import tag_entities_in_excerpts
from .gazette_text_extraction import extract_text_from_gazettes
Expand All @@ -10,3 +11,4 @@
TextExtractorInterface,
)
from .list_gazettes_to_be_processed import get_gazettes_to_be_processed
from .list_territories import get_territories
Loading
Loading