-
Notifications
You must be signed in to change notification settings - Fork 240
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
105 changed files
with
10,490 additions
and
9,945 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
# -*- coding: utf-8 -*- | ||
from __future__ import annotations | ||
|
||
import re | ||
from typing import TYPE_CHECKING | ||
|
||
if TYPE_CHECKING: | ||
from collections.abc import Iterator | ||
|
||
from pyglossary import slob | ||
from pyglossary.glossary_types import EntryType, GlossaryType | ||
|
||
from pyglossary.core import exc_note, log, pip | ||
from pyglossary.plugins.aard2_slob.tags import ( | ||
supported_tags, | ||
t_copyright, | ||
t_created_at, | ||
t_created_by, | ||
t_edition, | ||
t_label, | ||
t_license_name, | ||
t_license_url, | ||
t_uri, | ||
) | ||
|
||
|
||
class Reader: | ||
depends = { | ||
"icu": "PyICU", # >=1.5 | ||
} | ||
|
||
def __init__(self, glos: GlossaryType) -> None: | ||
self._glos = glos | ||
self._clear() | ||
self._re_bword = re.compile( | ||
"(<a href=[^<>]+?>)", | ||
re.IGNORECASE, | ||
) | ||
|
||
def close(self) -> None: | ||
if self._slobObj is not None: | ||
self._slobObj.close() | ||
self._clear() | ||
|
||
def _clear(self) -> None: | ||
self._filename = "" | ||
self._slobObj: slob.Slob | None = None | ||
|
||
# TODO: PLR0912 Too many branches (13 > 12) | ||
def open(self, filename: str) -> None: # noqa: PLR0912 | ||
try: | ||
import icu # type: ignore # noqa: F401 | ||
except ModuleNotFoundError as e: | ||
exc_note(e, f"Run `{pip} install PyICU` to install") | ||
raise | ||
from pyglossary import slob | ||
|
||
self._filename = filename | ||
self._slobObj = slob.open(filename) | ||
tags = dict(self._slobObj.tags.items()) | ||
|
||
if t_label in tags: | ||
self._glos.setInfo("name", tags[t_label]) | ||
|
||
if t_created_at in tags: | ||
self._glos.setInfo("creationTime", tags[t_created_at]) | ||
|
||
if t_created_by in tags: | ||
self._glos.setInfo("author", tags[t_created_by]) | ||
|
||
copyrightLines: list[str] = [] | ||
for key in (t_copyright, t_license_name, t_license_url): | ||
try: | ||
value = tags.pop(key) | ||
except KeyError: | ||
continue | ||
copyrightLines.append(value) | ||
if copyrightLines: | ||
self._glos.setInfo("copyright", "\n".join(copyrightLines)) | ||
|
||
if t_uri in tags: | ||
self._glos.setInfo("website", tags[t_uri]) | ||
|
||
if t_edition in tags: | ||
self._glos.setInfo("edition", tags[t_edition]) | ||
|
||
for key, value in tags.items(): | ||
if key in supported_tags: | ||
continue | ||
self._glos.setInfo(f"slob.{key}", value) | ||
|
||
def __len__(self) -> int: | ||
if self._slobObj is None: | ||
log.error("called len() on a reader which is not open") | ||
return 0 | ||
return len(self._slobObj) | ||
|
||
@staticmethod | ||
def _href_sub(m: re.Match) -> str: | ||
st = m.group(0) | ||
if "//" in st: | ||
return st | ||
return st.replace('href="', 'href="bword://').replace( | ||
"href='", | ||
"href='bword://", | ||
) | ||
|
||
def __iter__(self) -> Iterator[EntryType | None]: | ||
from pyglossary.slob import MIME_HTML, MIME_TEXT | ||
|
||
if self._slobObj is None: | ||
raise RuntimeError("iterating over a reader while it's not open") | ||
|
||
slobObj = self._slobObj | ||
blobSet = set() | ||
|
||
# slob library gives duplicate blobs when iterating over slobObj | ||
# even keeping the last id is not enough, since duplicate blobs | ||
# are not all consecutive. so we have to keep a set of blob IDs | ||
|
||
for blob in slobObj: | ||
id_ = blob.identity | ||
if id_ in blobSet: | ||
yield None # update progressbar | ||
continue | ||
blobSet.add(id_) | ||
|
||
# blob.key is str, blob.content is bytes | ||
word = blob.key | ||
|
||
ctype = blob.content_type.split(";")[0] | ||
if ctype not in {MIME_HTML, MIME_TEXT}: | ||
log.debug(f"unknown {blob.content_type=} in {word=}") | ||
word = word.removeprefix("~/") | ||
yield self._glos.newDataEntry(word, blob.content) | ||
continue | ||
defiFormat = "" | ||
if ctype == MIME_HTML: | ||
defiFormat = "h" | ||
elif ctype == MIME_TEXT: | ||
defiFormat = "m" | ||
|
||
defi = blob.content.decode("utf-8") | ||
defi = self._re_bword.sub(self._href_sub, defi) | ||
yield self._glos.newEntry(word, defi, defiFormat=defiFormat) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
t_created_at = "created.at" | ||
t_label = "label" | ||
t_created_by = "created.by" | ||
t_copyright = "copyright" | ||
t_license_name = "license.name" | ||
t_license_url = "license.url" | ||
t_uri = "uri" | ||
t_edition = "edition" | ||
|
||
supported_tags = { | ||
t_label, | ||
t_created_at, | ||
t_created_by, | ||
t_copyright, | ||
t_uri, | ||
t_edition, | ||
} | ||
|
||
__all__ = [ | ||
"supported_tags", | ||
"t_copyright", | ||
"t_created_at", | ||
"t_created_by", | ||
"t_edition", | ||
"t_label", | ||
"t_license_name", | ||
"t_license_url", | ||
"t_uri", | ||
] |
Oops, something went wrong.