diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d127f810d..24256bdbe 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -37,7 +37,8 @@ jobs: python-idzip \ lxml==5.3 \ marisa-trie \ - mistune + mistune \ + polib - name: Remove test cache run: rm -rf /home/runner/.cache/pyglossary/test || true - name: Run tests diff --git a/.gitignore b/.gitignore index 55f02db63..b5bccdf17 100644 --- a/.gitignore +++ b/.gitignore @@ -10,9 +10,10 @@ import-analyzer/ /.mypy_cache/ /plugins /ui -.coverage +*.cover* *,cover htmlcov +*.htmlcov vulture.* imports_from_set.json imports_set.json diff --git a/doc/p/__index__.md b/doc/p/__index__.md index a6baac76f..9828db61c 100644 --- a/doc/p/__index__.md +++ b/doc/p/__index__.md @@ -29,7 +29,6 @@ | Gettext Source (.po) | GettextPo | [gettext_po.md](./gettext_po.md) | | HTML Directory | HtmlDir | [html_dir.md](./html_dir.md) | | Glossary Info (.info) | Info | [info.md](./info.md) | -| IUPAC goldbook (.xml) | IUPACGoldbook | [iupac_goldbook.md](./iupac_goldbook.md) | | JMDict (xml) | JMDict | [jmdict.md](./jmdict.md) | | JMnedict | JMnedict | [jmnedict.md](./jmnedict.md) | | JSON (.json) | Json | [json.md](./json.md) | diff --git a/doc/p/iupac_goldbook.md b/doc/p/iupac_goldbook.md deleted file mode 100644 index e92a8c50c..000000000 --- a/doc/p/iupac_goldbook.md +++ /dev/null @@ -1,28 +0,0 @@ -## IUPAC goldbook (.xml) - -### General Information - -| Attribute | Value | -| --------------- | --------------------------- | -| Name | IUPACGoldbook | -| snake_case_name | iupac_goldbook | -| Description | IUPAC goldbook (.xml) | -| Extensions | | -| Read support | Yes | -| Write support | No | -| Single-file | Yes | -| Kind | 📝 text | -| Sort-on-write | default_no | -| Sort key | (`headword_lower`) | -| Wiki | ― | -| Website | https://goldbook.iupac.org/ | - -### Dependencies for reading - -PyPI Links: [lxml](https://pypi.org/project/lxml) - -To install, run: - -```sh -pip3 install lxml -``` diff 
--git a/doc/p/wiktextract.md b/doc/p/wiktextract.md index b723ddbd4..17cc17ac0 100644 --- a/doc/p/wiktextract.md +++ b/doc/p/wiktextract.md @@ -27,6 +27,7 @@ | example_padding | `10px 20px` | str | Padding for examples (css value) | | audio | `True` | bool | Enable audio | | audio_formats | `['ogg', 'mp3']` | list | List of audio formats to use | +| categories | `False` | bool | Enable categories | ### Dependencies for reading diff --git a/plugins-meta/index.json b/plugins-meta/index.json index c77264a2c..de3646dd5 100644 --- a/plugins-meta/index.json +++ b/plugins-meta/index.json @@ -1191,26 +1191,6 @@ "readOptions": {}, "writeOptions": {} }, - { - "module": "iupac_goldbook", - "lname": "iupac_goldbook", - "name": "IUPACGoldbook", - "description": "IUPAC goldbook (.xml)", - "extensions": [], - "singleFile": true, - "optionsProp": {}, - "canRead": true, - "canWrite": false, - "readOptions": {}, - "readDepends": { - "lxml": "lxml" - }, - "readCompressions": [ - "gz", - "bz2", - "lzma" - ] - }, { "module": "jmdict", "lname": "jmdict", @@ -1820,6 +1800,11 @@ "class": "ListOption", "type": "list", "comment": "List of audio formats to use" + }, + "categories": { + "class": "BoolOption", + "type": "bool", + "comment": "Enable categories" } }, "canRead": true, @@ -1833,7 +1818,8 @@ "audio_formats": [ "ogg", "mp3" - ] + ], + "categories": false }, "readDepends": { "lxml": "lxml" diff --git a/pyglossary/core.py b/pyglossary/core.py index 11aae5cda..ee3990042 100644 --- a/pyglossary/core.py +++ b/pyglossary/core.py @@ -22,9 +22,10 @@ def exc_note(e: Exception, note: str) -> Exception: try: - e.add_note(note) # # pyright: ignore[reportAttributeAccessIssue] + e.add_note(note) # pyright: ignore[reportAttributeAccessIssue] except AttributeError: - e.msg += "\n" + note # # pyright: ignore[reportAttributeAccessIssue] + if hasattr(e, "msg"): + e.msg += "\n" + note # pyright: ignore[reportAttributeAccessIssue] return e @@ -55,7 +56,7 @@ def exc_note(e: Exception, note: str) -> 
Exception: ] -VERSION = "5.0.0" +VERSION = "5.0.1" homePage = "https://github.com/ilius/pyglossary" diff --git a/pyglossary/ebook_base.py b/pyglossary/ebook_base.py index 3cf1e8c6e..dee1b2d94 100644 --- a/pyglossary/ebook_base.py +++ b/pyglossary/ebook_base.py @@ -289,6 +289,9 @@ def write_data_entry(self, entry: EntryType) -> None: "text/css", ) + def get_prefix(self, word: str) -> str: + raise NotImplementedError + def write_groups(self) -> Generator[None, EntryType, None]: # TODO: rtl=False option # TODO: handle alternates better (now shows word1|word2... in title) diff --git a/pyglossary/entry.py b/pyglossary/entry.py index 9304b5c9e..381f3f4e3 100644 --- a/pyglossary/entry.py +++ b/pyglossary/entry.py @@ -104,6 +104,10 @@ def s_word(self) -> str: def l_word(self) -> list[str]: return [self._fname] + @property + def lb_word(self) -> list[bytes]: + return [self._fname.encode("utf-8")] + @property def defi(self) -> str: return f"File: {self._fname}" @@ -194,7 +198,7 @@ def isData(cls) -> bool: @staticmethod def getRawEntrySortKey( - key: Callable[[bytes], Any], + key: Callable[[list[str]], Any], ) -> Callable[[RawEntryType], Any]: def newKey(x: RawEntryType) -> Any: # noqa: ANN401 # x is rawEntry, so x[2:] is list[bytes]: list of words in bytes diff --git a/pyglossary/entry_base.py b/pyglossary/entry_base.py index 29552a96c..15056b9fb 100644 --- a/pyglossary/entry_base.py +++ b/pyglossary/entry_base.py @@ -3,15 +3,17 @@ from __future__ import annotations import typing +from typing import TYPE_CHECKING -# from typing import TYPE_CHECKING +if TYPE_CHECKING: + from collections.abc import Callable __all__ = ["BaseEntry", "MultiStr"] MultiStr: typing.TypeAlias = "str | list[str]" -class BaseEntry: +class BaseEntry: # noqa: PLR0904 __slots__: list[str] = [ "_word", ] @@ -19,10 +21,33 @@ class BaseEntry: def __init__(self) -> None: self._word: str | list[str] + def isData(self) -> bool: ...
+ + def getFileName(self) -> str: + raise NotImplementedError + + @property + def data(self) -> bytes: + raise NotImplementedError + + def size(self) -> int: + raise NotImplementedError + + def save(self, directory: str) -> str: + raise NotImplementedError + @property def s_word(self) -> str: raise NotImplementedError + @property + def l_word(self) -> list[str]: + raise NotImplementedError + + @property + def lb_word(self) -> list[bytes]: + raise NotImplementedError + @property def defi(self) -> str: raise NotImplementedError @@ -36,3 +61,35 @@ def b_word(self) -> bytes: def b_defi(self) -> bytes: """Returns definition in bytes.""" return self.defi.encode("utf-8") + + @property + def defiFormat(self) -> str: + # TODO: type: Literal["m", "h", "x", "b"] + ... + + @defiFormat.setter + def defiFormat(self, defiFormat: str) -> None: + # TODO: type: Literal["m", "h", "x", "b"] + ... + + def detectDefiFormat(self, default: str = "") -> str: ... + + def addAlt(self, alt: str) -> None: ... + + def editFuncWord(self, func: Callable[[str], str]) -> None: ... + + def editFuncDefi(self, func: Callable[[str], str]) -> None: ... + + def strip(self) -> None: ... + + def replaceInWord(self, source: str, target: str) -> None: ... + + def replaceInDefi(self, source: str, target: str) -> None: ... + + def replace(self, source: str, target: str) -> None: ... + + def byteProgress(self) -> tuple[int, int] | None: ... + + def removeEmptyAndDuplicateAltWords(self) -> None: ... + + def stripFullHtml(self) -> str | None: ... 
diff --git a/pyglossary/glossary.py b/pyglossary/glossary.py index b4ea86ba8..f01c87642 100644 --- a/pyglossary/glossary.py +++ b/pyglossary/glossary.py @@ -20,7 +20,6 @@ from __future__ import annotations import warnings -from collections import OrderedDict as odict from os.path import relpath from time import perf_counter as now from typing import TYPE_CHECKING @@ -49,8 +48,8 @@ def __init__( ui: UIType | None = None, # noqa: F821 ) -> None: """ - info: OrderedDict or dict instance, or None - no need to copy OrderedDict instance before passing here + info: dict instance, or None + no need to copy dict instance before passing here we will not reference to it. """ warnings.warn( @@ -60,7 +59,7 @@ def __init__( ) GlossaryCommon.__init__(self, ui=ui) if info: - if not isinstance(info, dict | odict): + if not isinstance(info, dict): raise TypeError( "Glossary: `info` has invalid type" ", dict or OrderedDict expected", @@ -164,7 +163,11 @@ def sortWords( self._iter = self._loadedEntryGen() @classmethod - def detectInputFormat(cls, *args, **kwargs) -> DetectedFormat | None: # pyright: ignore[reportIncompatibleMethodOverride] + def detectInputFormat( # type: ignore # pyright: ignore[reportIncompatibleMethodOverride] + cls, + *args, + **kwargs, + ) -> DetectedFormat | None: try: return GlossaryCommon.detectInputFormat(*args, **kwargs) except Error as e: @@ -172,7 +175,11 @@ def detectInputFormat(cls, *args, **kwargs) -> DetectedFormat | None: # pyright return None @classmethod - def detectOutputFormat(cls, *args, **kwargs) -> DetectedFormat | None: # pyright: ignore[reportIncompatibleMethodOverride] + def detectOutputFormat( # type: ignore # pyright: ignore[reportIncompatibleMethodOverride] + cls, + *args, + **kwargs, + ) -> DetectedFormat | None: try: return GlossaryCommon.detectOutputFormat(*args, **kwargs) except Error as e: diff --git a/pyglossary/glossary_info.py b/pyglossary/glossary_info.py index 439d57708..94067328f 100644 --- a/pyglossary/glossary_info.py +++ 
b/pyglossary/glossary_info.py @@ -19,7 +19,6 @@ from __future__ import annotations import logging -from collections import OrderedDict as odict from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -45,7 +44,7 @@ class GlossaryInfo: def __init__(self) -> None: - self._info: dict[str, str] = odict() + self._info: dict[str, str] = {} def infoKeys(self) -> list[str]: return list(self._info) @@ -80,10 +79,10 @@ def setInfo(self, key: str, value: str | None) -> None: key = infoKeysAliasDict.get(key.lower(), key) self._info[key] = value - def getExtraInfos(self, excludeKeys: list[str]) -> odict: + def getExtraInfos(self, excludeKeys: list[str]) -> dict[str, str]: """ excludeKeys: a list of (basic) info keys to be excluded - returns an OrderedDict including the rest of info keys, + returns a dict including the rest of info keys, with associated values. """ excludeKeySet = set() @@ -93,7 +92,7 @@ def getExtraInfos(self, excludeKeys: list[str]) -> odict: if key2: excludeKeySet.add(key2) - extra = odict() + extra = {} for key, value in self._info.items(): if key in excludeKeySet: continue diff --git a/pyglossary/glossary_types.py b/pyglossary/glossary_types.py index 5caaafd86..3a5dcf466 100644 --- a/pyglossary/glossary_types.py +++ b/pyglossary/glossary_types.py @@ -14,7 +14,6 @@ ) if TYPE_CHECKING: - from collections import OrderedDict from typing import TypeAlias from .langs import Lang @@ -38,7 +37,7 @@ class EntryType(typing.Protocol): # noqa: PLR0904 - def __init__(self) -> None: ... + # def __init__(self) -> None: ... def isData(self) -> bool: ... @@ -157,7 +156,7 @@ def getInfo(self, key: str) -> str: ... def setInfo(self, key: str, value: str) -> None: ... - def getExtraInfos(self, excludeKeys: list[str]) -> OrderedDict: ... + def getExtraInfos(self, excludeKeys: list[str]) -> dict[str, str]: ... @property def author(self) -> str: ... @@ -225,6 +224,8 @@ def stripFullHtml( def preventDuplicateWords(self) -> None: ... 
+ def mergeEntriesWithSameHeadwordPlaintext(self) -> None: ... + def removeHtmlTagsAll(self) -> None: ... def addCleanupPath(self, path: str) -> None: ... diff --git a/pyglossary/glossary_v2.py b/pyglossary/glossary_v2.py index 4280b0242..85e6b541c 100644 --- a/pyglossary/glossary_v2.py +++ b/pyglossary/glossary_v2.py @@ -22,7 +22,6 @@ import os import os.path import warnings -from collections import OrderedDict as odict from contextlib import suppress from dataclasses import dataclass from os.path import ( @@ -148,7 +147,7 @@ def _closeReaders(self) -> None: def initVars(self) -> None: GlossaryProgress.clear(self) - self._info = odict() + self._info = {} readers = getattr(self, "_readers", []) for reader in readers: @@ -188,8 +187,8 @@ def __init__( ui: UIType | None = None, # noqa: F821 ) -> None: """ - info: OrderedDict or dict instance, or None - no need to copy OrderedDict instance before passing here + info: dict instance, or None + no need to copy dict instance before passing here we will not reference to it. 
""" GlossaryInfo.__init__(self) @@ -206,7 +205,7 @@ def __init__( self.initVars() if info: - if not isinstance(info, dict | odict): + if not isinstance(info, dict): raise TypeError( "Glossary: `info` has invalid type" ", dict or OrderedDict expected", @@ -273,13 +272,13 @@ def _entryFromRaw(self, rawEntry: RawEntryType) -> EntryType: fname = rawEntry[2].decode("utf-8") if isinstance(fname, list): fname = fname[0] # NESTED 4 - return DataEntry(fname, tmpPath=defi) # pyright: ignore[reportReturnType] + return DataEntry(fname, tmpPath=defi) return Entry( [b.decode("utf-8") for b in rawEntry[2:]], defi, defiFormat=defiFormat, - ) # pyright: ignore[reportReturnType] + ) @property def rawEntryCompress(self) -> bool: @@ -430,7 +429,7 @@ def _loadedEntryGen(self) -> Iterator[EntryType]: for _entry in self._data: entry = _entry for f in filters: - entry = f.run(entry) # pyright: ignore[reportArgumentType] + entry = f.run(entry) # type: ignore # pyright: ignore[reportArgumentType] # assert entry # TODO: measure running time in non-optimized mode yield entry # pyright: ignore[reportReturnType] self.progressEnd() @@ -1089,7 +1088,7 @@ def _resolveSortParams( self._switchToSQLite( inputFilename=args.inputFilename, ) - else: + elif not os.getenv("NO_SQLITE"): self._data = self._newInMemorySqEntryList() self._data.setSortKey( diff --git a/pyglossary/info.py b/pyglossary/info.py index 7b25d3a44..82cbf03a5 100644 --- a/pyglossary/info.py +++ b/pyglossary/info.py @@ -31,6 +31,6 @@ ## # do not map "publisher" to "author" ## - # are there alternatives to "creationTime" - # and "lastUpdated"? + "date": "creationTime", + # are there alternatives to "lastUpdated"? 
} diff --git a/pyglossary/info_writer.py b/pyglossary/info_writer.py index ffc73deb6..d8c997b51 100644 --- a/pyglossary/info_writer.py +++ b/pyglossary/info_writer.py @@ -1,7 +1,7 @@ from __future__ import annotations from os.path import splitext -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from pyglossary.core import log from pyglossary.io_utils import nullTextIO @@ -33,7 +33,7 @@ def finish(self) -> None: def write(self) -> Generator[None, EntryType, None]: # noqa: PLR0912, C901 import re - from collections import Counter, OrderedDict + from collections import Counter from pyglossary.json_utils import dataToPrettyJson from pyglossary.langs.writing_system import getWritingSystemFromText @@ -105,9 +105,7 @@ def write(self) -> Generator[None, EntryType, None]: # noqa: PLR0912, C901 data_entry_count = defiFormatCounter["b"] del defiFormatCounter["b"] - info = OrderedDict() - for key, value in glos.iterInfo(): - info[key] = value + info: dict[str, Any] = dict(glos.iterInfo()) info["word_count"] = wordCount info["bword_count"] = bwordCount info["non_lowercase_word_count"] = nonLowercaseWordCount diff --git a/pyglossary/json_utils.py b/pyglossary/json_utils.py index fe34ce540..318edb9e8 100644 --- a/pyglossary/json_utils.py +++ b/pyglossary/json_utils.py @@ -1,16 +1,14 @@ from __future__ import annotations import json -from collections import OrderedDict from typing import TYPE_CHECKING if TYPE_CHECKING: from typing import AnyStr, TypeAlias -__all__ = ["dataToPrettyJson", "jsonToData", "jsonToOrderedData"] +__all__ = ["dataToPrettyJson", "jsonToData"] JsonEncodable: TypeAlias = dict | list -# OrderedDict is also subclass of Dict, issubclass(OrderedDict, Dict) is True def dataToPrettyJson( @@ -28,9 +26,3 @@ def dataToPrettyJson( def jsonToData(st: AnyStr) -> JsonEncodable: return json.loads(st) - - -def jsonToOrderedData(text: str) -> OrderedDict: - return json.JSONDecoder( - object_pairs_hook=OrderedDict, - ).decode(text) diff --git 
a/pyglossary/option.py b/pyglossary/option.py index a1d1a9336..cef9335a0 100644 --- a/pyglossary/option.py +++ b/pyglossary/option.py @@ -446,9 +446,7 @@ def toDict(self) -> dict[str, Any]: return data def groupValues(self) -> dict[str, Any] | None: - from collections import OrderedDict - - groups: dict[str, list[str]] = OrderedDict() + groups: dict[str, list[str]] = {} others: list[str] = [] for value in self.values or []: cats = self.re_category.findall(value) diff --git a/pyglossary/plugin_prop.py b/pyglossary/plugin_prop.py index da36af48e..1d8e61f19 100644 --- a/pyglossary/plugin_prop.py +++ b/pyglossary/plugin_prop.py @@ -20,7 +20,6 @@ import logging import warnings -from collections import OrderedDict as odict from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -331,10 +330,11 @@ def _getOptionAttrNamesFromClass(rwclass: type) -> list[str]: return nameList def _getOptionsFromClass(self, rwclass: type | None) -> dict[str, Any]: - optionsProp = self.optionsProp - options = odict() if rwclass is None: - return options + return {} + + optionsProp = self.optionsProp + options: dict[str, Any] = {} for attrName in self._getOptionAttrNamesFromClass(rwclass): name = attrName[1:] diff --git a/pyglossary/plugins/appledict/_content.py b/pyglossary/plugins/appledict/_content.py index 79e9f72a7..f29ceffa9 100644 --- a/pyglossary/plugins/appledict/_content.py +++ b/pyglossary/plugins/appledict/_content.py @@ -225,7 +225,7 @@ def prepare_content_with_soup( # noqa: PLR0912 tag.name = "del" if title and " AppleDictProperties: dictionaryIndexes: list[dict[str, Any]] | None = metadata.get( "IDXDictionaryIndexes", ) - if dictionaryIndexes: - key_text_metadata: dict[str, Any] = dictionaryIndexes[0] - body_metadata: dict[str, Any] = dictionaryIndexes[2] - else: - key_text_metadata: dict[str, Any] = {} - body_metadata: dict[str, Any] = {} + key_text_metadata: dict[str, Any] = ( + dictionaryIndexes[0] if dictionaryIndexes else {} + ) + body_metadata: dict[str, Any] = 
dictionaryIndexes[2] if dictionaryIndexes else {} key_text_data_fields = key_text_metadata.get("IDXIndexDataFields", {}) key_text_variable_fields = [ diff --git a/pyglossary/plugins/babylon_bgl/bgl_reader.py b/pyglossary/plugins/babylon_bgl/bgl_reader.py index 8c30c5a58..faea1ffa5 100644 --- a/pyglossary/plugins/babylon_bgl/bgl_reader.py +++ b/pyglossary/plugins/babylon_bgl/bgl_reader.py @@ -25,7 +25,6 @@ import io import os import re -from collections import OrderedDict as odict from typing import TYPE_CHECKING, NamedTuple from pyglossary.core import log @@ -346,7 +345,7 @@ class BglReader: def __init__(self, glos: GlossaryType) -> None: # no more arguments self._glos = glos self._filename = "" - self.info = odict() + self.info = {} self.numEntries = None #### self.sourceLang = "" diff --git a/pyglossary/plugins/crawler_dir.py b/pyglossary/plugins/crawler_dir.py index def850d76..9c0ec0557 100644 --- a/pyglossary/plugins/crawler_dir.py +++ b/pyglossary/plugins/crawler_dir.py @@ -90,8 +90,6 @@ def filePathFromWord(b_word: bytes) -> str: ) def write(self) -> None: - from collections import OrderedDict as odict - from pyglossary.json_utils import dataToPrettyJson filename = self._filename @@ -127,16 +125,11 @@ def write(self) -> None: mode="w", encoding="utf-8", ) as infoFile: - info = odict() + info = {} info["name"] = self._glos.getInfo("name") info["wordCount"] = wordCount - for key, value in self._glos.getExtraInfos( - ( - "name", - "wordCount", - ), - ).items(): - info[key] = value + info |= self._glos.getExtraInfos(["name", "wordCount"]) + infoFile.write(dataToPrettyJson(info)) @@ -147,12 +140,12 @@ def __init__(self, glos: GlossaryType) -> None: self._wordCount = 0 def open(self, filename: str) -> None: - from pyglossary.json_utils import jsonToOrderedData + from pyglossary.json_utils import jsonToData self._filename = filename with open(join(filename, "info.json"), encoding="utf-8") as infoFp: - info = jsonToOrderedData(infoFp.read()) + info = 
jsonToData(infoFp.read()) self._wordCount = info.pop("wordCount") for key, value in info.items(): self._glos.setInfo(key, value) diff --git a/pyglossary/plugins/dsl/_types.py b/pyglossary/plugins/dsl/_types.py index 62f849ba9..cc6c0f0a3 100644 --- a/pyglossary/plugins/dsl/_types.py +++ b/pyglossary/plugins/dsl/_types.py @@ -28,34 +28,36 @@ class TransformerType(typing.Protocol): def __init__(self) -> None: pass - def end(self) -> bool: - pass + def end(self) -> bool: ... - def move(self, chars: int) -> None: - pass + def move(self, chars: int) -> None: ... - def next(self) -> str: - pass + def next(self) -> str: ... - def follows(self, st: str) -> bool: - pass + def follows(self, st: str) -> bool: ... - def skipAny(self, chars: str) -> None: - pass + def skipAny(self, chars: str) -> None: ... - def addText(self, st: str) -> None: - pass + def addText(self, st: str) -> None: ... - def resetBuf(self) -> str: - pass + def resetBuf(self) -> None: ... + + def addHtml(self, st: str) -> None: ... + + def closeTag(self, tag: str) -> None: ... + + @property + def labelOpen(self) -> bool: ... + + @labelOpen.setter + def labelOpen(self, value: bool) -> None: ... class TitleTransformerType(TransformerType, typing.Protocol): title: str outputAlt: str - def addText2(self, st: str) -> None: - pass + def addText2(self, st: str) -> None: ...
if TYPE_CHECKING: diff --git a/pyglossary/plugins/ebook_kobo.py b/pyglossary/plugins/ebook_kobo.py index 2a04fb8b9..02a108f88 100644 --- a/pyglossary/plugins/ebook_kobo.py +++ b/pyglossary/plugins/ebook_kobo.py @@ -157,7 +157,6 @@ def fix_defi(self, defi: str) -> str: def write_groups(self) -> Generator[None, EntryType, None]: import gzip - from collections import OrderedDict dataEntryCount = 0 @@ -192,7 +191,7 @@ def writeGroup(lastPrefix: str) -> None: continue l_word = entry.l_word allWords += l_word - wordsByPrefix: dict[str, list[str]] = OrderedDict() + wordsByPrefix: dict[str, list[str]] = {} for word in l_word: prefix = self.get_prefix(word) if prefix in wordsByPrefix: diff --git a/pyglossary/plugins/edict2/conv.py b/pyglossary/plugins/edict2/conv.py index 2f54dd6d4..5cd46eba4 100644 --- a/pyglossary/plugins/edict2/conv.py +++ b/pyglossary/plugins/edict2/conv.py @@ -88,7 +88,7 @@ def render_syllables_color( # @lru_cache(maxsize=128) def convert_pinyin(pinyin: str) -> tuple[Sequence[str], Sequence[str]]: - return tuple(zip(*map(convert, pinyin.split()), strict=False)) + return tuple(zip(*map(convert, pinyin.split()), strict=False)) # type: ignore def render_article( diff --git a/pyglossary/plugins/edlin.py b/pyglossary/plugins/edlin.py index 4d6ada1f7..fc5e428f8 100644 --- a/pyglossary/plugins/edlin.py +++ b/pyglossary/plugins/edlin.py @@ -98,7 +98,7 @@ def _clear(self) -> None: self._resFileNames: list[str] = [] def open(self, filename: str) -> None: - from pyglossary.json_utils import jsonToOrderedData + from pyglossary.json_utils import jsonToData if isdir(filename): infoFname = join(filename, "info.json") @@ -112,7 +112,7 @@ def open(self, filename: str) -> None: self._filename = filename with open(infoFname, encoding=self._encoding) as infoFp: - info = jsonToOrderedData(infoFp.read()) + info = jsonToData(infoFp.read()) self._wordCount = info.pop("wordCount") self._prev_link = info.pop("prev_link") self._rootPath = info.pop("root") @@ -271,8 +271,6 @@ def 
saveEntry( ) def write(self) -> Generator[None, EntryType, None]: - from collections import OrderedDict as odict - from pyglossary.json_utils import dataToPrettyJson thisEntry = yield @@ -302,21 +300,13 @@ def write(self) -> Generator[None, EntryType, None]: "w", encoding=self._encoding, ) as toFile: - info = odict() + info = {} info["name"] = self._glos.getInfo("name") info["root"] = self.hashToPath(rootHash) info["prev_link"] = self._prev_link info["wordCount"] = count # info["modified"] = - for key, value in self._glos.getExtraInfos( - ( - "name", - "root", - "prev_link", - "wordCount", - ), - ).items(): - info[key] = value + info |= self._glos.getExtraInfos(["name", "root", "prev_link", "wordCount"]) toFile.write(dataToPrettyJson(info)) diff --git a/pyglossary/plugins/freedict/reader.py b/pyglossary/plugins/freedict/reader.py index 978f86fb0..ef70b1844 100644 --- a/pyglossary/plugins/freedict/reader.py +++ b/pyglossary/plugins/freedict/reader.py @@ -30,7 +30,7 @@ NAMESPACE = {None: "http://www.tei-c.org/ns/1.0"} -@dataclass +@dataclass(slots=True) class ParsedSense: transCits: list[Element] defs: list[Element] @@ -152,9 +152,14 @@ def writeTransCit( ) -> None: from lxml import etree as ET + children = elem.xpath("child::node()") + if not children: + return + assert isinstance(children, list) + quotes: list[Element] = [] sense = ET.Element(f"{TEI}sense") - for child in elem.xpath("child::node()"): + for child in children: if isinstance(child, str): child = child.strip() # noqa: PLW2901 if child: @@ -214,9 +219,12 @@ def writeChild(item: str | Element, depth: int) -> None: return if item.tag == f"{TEI}ref": - if count > 0: - hf.write(self.getCommaSep(item.text)) - self.writeRef(hf, item) + if item.text: + if count > 0: + hf.write(self.getCommaSep(item.text)) + self.writeRef(hf, item) + else: + log.warning(f"ref without text: {self.tostring(item)}") return for child in item.xpath("child::node()"): diff --git a/pyglossary/plugins/gettext_po.py 
b/pyglossary/plugins/gettext_po.py index f6142d252..978b7c455 100644 --- a/pyglossary/plugins/gettext_po.py +++ b/pyglossary/plugins/gettext_po.py @@ -12,6 +12,7 @@ BoolOption, Option, ) +from pyglossary.text_utils import splitByBar if TYPE_CHECKING: import io @@ -60,6 +61,7 @@ class Reader: def __init__(self, glos: GlossaryType) -> None: self._glos = glos + self._alts = glos.alts self.clear() def clear(self) -> None: @@ -95,6 +97,11 @@ def __len__(self) -> int: ) return self._wordCount + def makeEntry(self, word: str, defi: str) -> EntryType: + if self._alts: + return self._glos.newEntry(splitByBar(word), defi) + return self._glos.newEntry(word, defi) + def __iter__(self) -> Iterator[EntryType]: # noqa: PLR0912 try: from polib import unescape as po_unescape @@ -108,15 +115,15 @@ def __iter__(self) -> Iterator[EntryType]: # noqa: PLR0912 defi = "" msgstr = False wordCount = 0 - for line in file: - line = line.strip() # noqa: PLW2901 + for line_ in file: + line = line_.strip() # noqa: PLW2901 if not line: continue if line.startswith("#"): continue if line.startswith("msgid "): if word: - yield self._glos.newEntry(word, defi) + yield self.makeEntry(word, defi) wordCount += 1 word = "" defi = "" @@ -125,18 +132,35 @@ def __iter__(self) -> Iterator[EntryType]: # noqa: PLR0912 # TODO: parse defi and set glos info? 
# but this should be done in self.open word = po_unescape(line[6:]) + if word.startswith('"'): + if len(word) < 2 or word[-1] != '"': + raise ValueError(f"invalid po line: {line}") + word = word[1:-1] msgstr = False - elif line.startswith("msgstr "): + continue + if line.startswith("msgstr "): if msgstr: log.error("msgid omitted!") defi = po_unescape(line[7:]) + if defi.startswith('"'): + if len(defi) < 2 or defi[-1] != '"': + raise ValueError(f"invalid po line: {line}") + defi = defi[1:-1] msgstr = True - elif msgstr: - defi += po_unescape(line) + continue + + line = po_unescape(line) + if line.startswith('"'): + if len(line) < 2 or line[-1] != '"': + raise ValueError(f"invalid po line: {line}") + line = line[1:-1] + + if msgstr: + defi += line else: - word += po_unescape(line) + word += line if word: - yield self._glos.newEntry(word, defi) + yield self.makeEntry(word, defi) wordCount += 1 self._wordCount = wordCount @@ -152,13 +176,20 @@ def __init__(self, glos: GlossaryType) -> None: self._glos = glos self._filename = "" self._file: io.TextIOBase = nullTextIO + glos.preventDuplicateWords() def open(self, filename: str) -> None: + try: + from polib import escape as po_escape + except ModuleNotFoundError as e: + exc_note(e, f"Run `{pip} install polib` to install") + raise + self._filename = filename self._file = file = open(filename, mode="w", encoding="utf-8") file.write('#\nmsgid ""\nmsgstr ""\n') for key, value in self._glos.iterInfo(): - file.write(f'"{key}: {value}\\n"\n') + file.write(f'"{po_escape(key)}: {po_escape(value)}\\n"\n') def finish(self) -> None: self._filename = "" @@ -166,11 +197,7 @@ def finish(self) -> None: self._file = nullTextIO def write(self) -> Generator[None, EntryType, None]: - try: - from polib import escape as po_escape - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install polib` to install") - raise + from polib import escape as po_escape file = self._file @@ -185,6 +212,6 @@ def write(self) -> Generator[None, EntryType,
None]: entry.save(filename + "_res") continue file.write( - f"msgid {po_escape(entry.s_word)}\n" - f"msgstr {po_escape(entry.defi)}\n\n", + f'msgid "{po_escape(entry.s_word)}"\n' + f'msgstr "{po_escape(entry.defi)}"\n\n', ) diff --git a/pyglossary/plugins/info_plugin.py b/pyglossary/plugins/info_plugin.py index 39f9fed62..8c4852ae0 100644 --- a/pyglossary/plugins/info_plugin.py +++ b/pyglossary/plugins/info_plugin.py @@ -54,10 +54,10 @@ def close(self) -> None: pass def open(self, filename: str) -> None: - from pyglossary.json_utils import jsonToOrderedData + from pyglossary.json_utils import jsonToData with open(filename, encoding="utf-8") as infoFp: - info = jsonToOrderedData(infoFp.read()) + info = jsonToData(infoFp.read()) for key, value in info.items(): self._glos.setInfo(key, value) diff --git a/pyglossary/plugins/iupac_goldbook.py b/pyglossary/plugins/iupac_goldbook.py deleted file mode 100644 index 18d79ed5a..000000000 --- a/pyglossary/plugins/iupac_goldbook.py +++ /dev/null @@ -1,315 +0,0 @@ -# -*- coding: utf-8 -*- -# mypy: ignore-errors -from __future__ import annotations - -from io import BytesIO -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from collections.abc import Iterator - - from pyglossary.glossary_types import ( - EntryType, - GlossaryType, - ) - from pyglossary.lxml_types import Element - from pyglossary.option import Option -from pyglossary.compression import ( - compressionOpen, - stdCompressions, -) -from pyglossary.core import exc_note, log, pip -from pyglossary.html_utils import unescape_unicode - -__all__ = [ - "Reader", - "description", - "enable", - "extensionCreate", - "extensions", - "kind", - "lname", - "name", - "optionsProp", - "singleFile", - "website", - "wiki", -] - -enable = True -lname = "iupac_goldbook" -name = "IUPACGoldbook" -description = "IUPAC goldbook (.xml)" -extensions = () -extensionCreate = ".xml" -singleFile = True -kind = "text" -wiki = "" -website = "https://goldbook.iupac.org/" -optionsProp: 
dict[str, Option] = {} - - -class Reader: - compressions = stdCompressions - depends = { - "lxml": "lxml", - } - - def __init__(self, glos: GlossaryType) -> None: - self._glos = glos - self._filename = "" - self._file = None - self._fileSize = 0 - self._termByCode: dict[str, str] = {} - - def __len__(self) -> int: - return 0 - - def close(self) -> None: - if self._file: - self._file.close() - self._file = None - self._filename = "" - self._fileSize = 0 - self._termByCode = {} - - def open(self, filename: str) -> None: - try: - from lxml import etree as ET - except ModuleNotFoundError as e: - exc_note(e, f"Run `{pip} install lxml` to install") - raise - - self._filename = filename - file = compressionOpen(filename, mode="rb") - file.seek(0, 2) - self._fileSize = file.tell() - file.seek(0) - - chunk = file.read(800) - chunk_end = chunk.find(b"") - chunk = chunk[:chunk_end] - chunk += b"" - - infoRoot = ET.fromstring(chunk) - self.setMetadata(infoRoot) - - file.seek(0) - context = ET.iterparse( - file, - events=("end",), - tag="entry", - ) - termByCode: dict[str, str] = {} - for _, elem in context: - termE = elem.find("./term") - if termE is None: - continue - term = self.getTerm(termE) - codeE = elem.find("./code") - if codeE is None: - continue - termByCode[codeE.text] = term - self._termByCode = termByCode - - file.close() - - def setGlosInfo(self, key: str, value: str) -> None: - if value is None: - return - self._glos.setInfo(key, unescape_unicode(value)) - - def setMetadata(self, header: Element) -> None: - if header is None: - return - - title = header.find("./title") - if title: - self.setGlosInfo("name", title.text) - - publisher = header.find("./publisher") - if publisher: - self.setGlosInfo("publisher", publisher.text) - - isbn = header.find("./isbn") - if isbn: - self.setGlosInfo("isbn", isbn.text) - - doi = header.find("./doi") - if doi: - self.setGlosInfo("doi", doi.text) - - accessdate = header.find("./accessdate") - if accessdate: - 
self.setGlosInfo("creationTime", accessdate.text) - - @staticmethod - def tostring( - elem: Element, - ) -> str: - from lxml import etree as ET - - return ( - ET.tostring( - elem, - method="html", - pretty_print=True, - ) - .decode("utf-8") - .strip() - ) - - @staticmethod - def innerXML(elem: Element) -> str: - from lxml import etree as ET - - elemName = elem.xpath("name(/*)") - resultStr = "" - for e in elem.xpath("/" + elemName + "/node()"): - if isinstance(e, str): - pass # resultStr += e - else: - resultStr += ET.tostring(e, encoding="unicode") - - return resultStr - - def getTerm(self, termE: Element) -> str: # noqa: PLR6301 - from lxml import etree as ET - - term = ( - ET.tostring( - termE, - method="html", - pretty_print=False, - ) - .decode("utf-8") - .strip()[6:-7] - .strip() - ) - term = unescape_unicode(term) - term = term.replace("", "").replace("", "") - return term # noqa: RET504 - - def __iter__(self) -> Iterator[EntryType]: # noqa: PLR0912 - from lxml import etree as ET - - glos = self._glos - fileSize = self._fileSize - termByCode = self._termByCode - - self._file = file = compressionOpen(self._filename, mode="rb") - context = ET.iterparse( - self._file, - events=("end",), - tag="entry", - ) - for _, elem in context: # noqa: PLR1702 - codeE = elem.find("./code") - if codeE is None: - continue - code = codeE.text - - id_ = elem.attrib.get("id") - termE = elem.find("./term") - if termE is None: - log.warning(f"no term, {code=}, {id_=}") - continue - - term = self.getTerm(termE) - - words: list[str] = [] - if term: - words.append(term) - if code: - words.append(code) - - # if _id is not None: - # words.append(f"id{_id}") - - identifierTerm = elem.find("./identifiers/term") - if identifierTerm is not None and identifierTerm.text: - words.append(identifierTerm.text) - - identifierSynonym = elem.find("./identifiers/synonym") - if identifierSynonym is not None and identifierSynonym.text: - words.append(identifierSynonym.text) - - defiParts: list[str] = 
[] - - definition = elem.find("./definition") - if definition is None or not definition.text: - pass - else: - defiParts.append(definition.text) - - definitionEntryList = elem.findall("./definition/entry") - if definitionEntryList: - bio = BytesIO() - with ET.htmlfile(bio, encoding="utf-8") as hf: - with hf.element("ol"): - for item in definitionEntryList: - if not item.text: - continue - with hf.element("li"): - hf.write(item.text) - listHtml = bio.getvalue().decode("utf-8") - defiParts.append(listHtml) - - replacedbyE = elem.find("./replacedby") - if replacedbyE is not None: - replacedby = replacedbyE.text - replacedbyCode = replacedby.split(".")[-1] - replacedbyTerm = termByCode.get(replacedbyCode) - if replacedbyTerm is None: - log.warning(f"{term}: {replacedby=}") - replacedbyTerm = replacedbyCode - defiParts.append( - f'Replaced by: {replacedbyTerm}', - ) - - relatedList = elem.findall("./related/entry") - if relatedList: - relatedLinkList: list[str] = [] - for related in relatedList: - relatedURL = related.text - relatedCode = relatedURL.split("/")[-1] - relatedTerm = termByCode.get(relatedCode) - if not relatedTerm: - log.warning(f"{term}: {relatedURL=}") - relatedTerm = relatedCode - relatedLinkList.append( - f'{relatedTerm}', - ) - defiParts.append("Related: " + ", ".join(relatedLinkList)) - - lastupdatedE = elem.find("./lastupdated") - if lastupdatedE is not None: - defiParts.append(f"Last updated: {lastupdatedE.text}") - - urlE = elem.find("./url") - if urlE is not None: - defiParts.append(f'More info.') - - if len(defiParts) > 1: - defiParts.insert(1, "") - - try: - defi = "
".join(defiParts) - except Exception: - log.error(f"{defiParts = }") - continue - - yield glos.newEntry( - words, - defi, - defiFormat="h", - byteProgress=(file.tell(), fileSize), - ) - - # clean up preceding siblings to save memory - # this can reduce memory usage from >300 MB to ~25 MB - parent = elem.getparent() - if parent is None: - continue - while elem.getprevious() is not None: - del parent[0] diff --git a/pyglossary/plugins/quickdic6/quickdic.py b/pyglossary/plugins/quickdic6/quickdic.py index e4efdb84e..aeb6e9c94 100644 --- a/pyglossary/plugins/quickdic6/quickdic.py +++ b/pyglossary/plugins/quickdic6/quickdic.py @@ -94,6 +94,7 @@ def add_index( # noqa: PLR0913 index_entries: list[IndexEntryType] = [] for token, token_norm, ttype, tidx in tokens: prev_token = index_entries[-1][0] if index_entries else "" + html_indices: list[int] if prev_token == token: ( token, # noqa: PLW2901 @@ -107,7 +108,7 @@ def add_index( # noqa: PLR0913 index_start = len(rows) count = 0 token_norm = "" if token == token_norm else token_norm # noqa: PLW2901 - html_indices: list[int] = [] + html_indices = [] rows.append((1, i_entry)) if ttype == 4: if tidx not in html_indices: diff --git a/pyglossary/plugins/stardict/reader.py b/pyglossary/plugins/stardict/reader.py index c942b55bf..2a9f21190 100644 --- a/pyglossary/plugins/stardict/reader.py +++ b/pyglossary/plugins/stardict/reader.py @@ -281,11 +281,10 @@ def renderRawDefiList( defiFormatSet.add(format_) if len(defiFormatSet) == 1: - defis = [_defi for _defi, _ in defisWithFormat] format_ = defiFormatSet.pop() if format_ == "h": - return "\n
".join(defis), format_ - return "\n".join(defis), format_ + return "\n
".join([defi for defi, _ in defisWithFormat]), format_ + return "\n".join([defi for defi, _ in defisWithFormat]), format_ if not defiFormatSet: log.error(f"empty defiFormatSet, {rawDefiList=}") diff --git a/pyglossary/plugins/stardict/writer.py b/pyglossary/plugins/stardict/writer.py index 79723c625..acd9a9b19 100644 --- a/pyglossary/plugins/stardict/writer.py +++ b/pyglossary/plugins/stardict/writer.py @@ -3,7 +3,6 @@ import os import re -from collections import OrderedDict from os.path import ( dirname, getsize, @@ -387,14 +386,12 @@ def writeIfoFile( defiFormat = self._sametypesequence indexFileSize = getsize(self._filename + ".idx") - ifoDict: dict[str, str] = OrderedDict( - { - "version": "3.0.0", - "bookname": self.getBookname(), - "wordcount": str(wordCount), - "idxfilesize": str(indexFileSize), - } - ) + ifoDict: dict[str, str] = { + "version": "3.0.0", + "bookname": self.getBookname(), + "wordcount": str(wordCount), + "idxfilesize": str(indexFileSize), + } if self._large_file: ifoDict["idxoffsetbits"] = "64" diff --git a/pyglossary/plugins/stardict_textual.py b/pyglossary/plugins/stardict_textual.py index 60d30a60b..a54d04266 100644 --- a/pyglossary/plugins/stardict_textual.py +++ b/pyglossary/plugins/stardict_textual.py @@ -180,22 +180,20 @@ def renderDefiList( defiFormatSet.update(_type for _, _type in defisWithFormat) if len(defiFormatSet) == 1: - defis = [_defi for _defi, _ in defisWithFormat] format_ = defiFormatSet.pop() if format_ == "h": - return "\n
".join(defis), format_ - return "\n".join(defis), format_ + return "\n
".join([defi for defi, _ in defisWithFormat]), format_ + return "\n".join([defi for defi, _ in defisWithFormat]), format_ # convert plaintext or xdxf to html defis: list[str] = [] for defi_, format_ in defisWithFormat: - defi = defi_ if format_ == "m": - defi = defi.replace("\n", "
") - defi = f"
{defi}
" + defis.append("
" + defi_.replace("\n", "
") + "
") elif format_ == "x": - defi = self.xdxf_transform(defi) - defis.append(defi) + defis.append(self.xdxf_transform(defi_)) + else: + defis.append(defi_) return "\n
\n".join(defis), "h" def __iter__(self) -> Iterator[EntryType]: @@ -213,7 +211,7 @@ def __iter__(self) -> Iterator[EntryType]: elem = cast("Element", _elem) words: list[str] = [] defisWithFormat: list[tuple[str, str]] = [] - for child in elem.getchildren(): + for child in elem.iterchildren(): if not child.text: continue if child.tag in {"key", "synonym"}: diff --git a/pyglossary/plugins/wiktextract.py b/pyglossary/plugins/wiktextract.py index 9f7e6734c..9fa987bb6 100644 --- a/pyglossary/plugins/wiktextract.py +++ b/pyglossary/plugins/wiktextract.py @@ -2,6 +2,7 @@ from __future__ import annotations import collections +from collections import Counter from io import BytesIO, IOBase from json import loads as json_loads from typing import TYPE_CHECKING, cast @@ -77,6 +78,9 @@ "audio_formats": ListOption( comment="List of audio formats to use", ), + "categories": BoolOption( + comment="Enable categories", + ), } @@ -97,6 +101,8 @@ class Reader: _audio_formats: list[str] = ["ogg", "mp3"] + _categories: bool = False + topicStyle = ( "color:white;" "background:green;" @@ -140,7 +146,7 @@ def open( self._glos.setInfo("definition_has_headwords", "True") self._file = cfile - self._warnings = collections.Counter() + self._warnings: Counter[str] = collections.Counter() def close(self) -> None: self._file.close() @@ -166,7 +172,7 @@ def __iter__(self) -> Iterator[EntryType]: def warning(self, msg: str) -> None: self._warnings[msg] += 1 - def makeEntry(self, data: dict[str, Any]) -> EntryType: + def makeEntry(self, data: dict[str, Any]) -> EntryType: # noqa: PLR0912 from lxml import etree as ET glos = self._glos @@ -216,7 +222,9 @@ def br() -> Element: with hf.element("font", color=self._gram_color): hf.write(pos) - self.writeSenseList(hf_, data.get("senses")) # type: ignore + senses = data.get("senses") or [] + + self.writeSenseList(hf_, senses) # type: ignore self.writeSynonyms(hf_, data.get("synonyms")) # type: ignore @@ -228,9 +236,18 @@ def br() -> Element: etymology: str = 
data.get("etymology_text", "") if etymology: + hf.write(br()) with hf.element("div"): hf.write(f"Etymology: {etymology}") + if self._categories: + categories = [] + for sense in senses: + senseCats = sense.get("categories") + if senseCats: + categories += senseCats + self.writeSenseCategories(hf_, categories) + defi = f.getvalue().decode("utf-8") # defi = defi.replace("\xa0", " ") # do we need to do this? file = self._file @@ -366,12 +383,13 @@ def writeSenseCategories( def writeSenseExample( # noqa: PLR6301, PLR0912 self, hf: T_htmlfile, - example: dict[str, str], + example: dict[str, str | list], ) -> None: # example keys: text, "english", "ref", "type" - textList: list[tuple[str, str]] = [] - text_ = example.pop("example", "") + textList: list[tuple[str | None, str]] = [] + text_: str | list = example.pop("example", "") if text_: + assert isinstance(text_, str) textList.append((None, text_)) example.pop("ref", "") @@ -380,7 +398,7 @@ def writeSenseExample( # noqa: PLR6301, PLR0912 for key, value in example.items(): if not value: continue - prefix = key + prefix: str | None = key if prefix in ("text",): # noqa: PLR6201, FURB171 prefix = None if isinstance(value, str): @@ -388,7 +406,7 @@ def writeSenseExample( # noqa: PLR6301, PLR0912 elif isinstance(value, list): for item in value: if isinstance(item, str): - textList.append((prefix, value)) + textList.append((prefix, item)) elif isinstance(item, list): textList += [(prefix, item2) for item2 in item] else: @@ -397,7 +415,7 @@ def writeSenseExample( # noqa: PLR6301, PLR0912 if not textList: return - def writePair(prefix: str, text: str) -> None: + def writePair(prefix: str | None, text: str) -> None: if prefix: with hf.element("b"): hf.write(prefix) @@ -417,7 +435,7 @@ def writePair(prefix: str, text: str) -> None: def writeSenseExamples( self, hf: T_htmlfile, - examples: list[dict[str, str]] | None, + examples: list[dict[str, str | list]] | None, ) -> None: from lxml import etree as ET @@ -643,8 +661,6 @@ def 
writeSense( if glosses: self.makeList(hf, glosses, self.writeSenseGloss) - self.writeSenseCategories(hf, sense.get("categories")) - self.writeTopics(hf, sense.get("topics")) self.writeSenseFormOfList(hf, sense.get("form_of")) @@ -676,24 +692,24 @@ def makeList( # noqa: PLR0913 hf: T_htmlfile, input_objects: list[Any], processor: Callable, - single_prefix: str = "", - skip_single: bool = True, ordered: bool = True, - list_type: str = "", + skip_single: bool = True, + # single_prefix: str = "", + # list_type: str = "", ) -> None: """Wrap elements into
    if more than one element.""" if not input_objects: return if skip_single and len(input_objects) == 1: - if single_prefix: - hf.write(single_prefix) + # if single_prefix: + # hf.write(single_prefix) processor(hf, input_objects[0]) return attrib: dict[str, str] = {} - if list_type: - attrib["type"] = list_type + # if list_type: + # attrib["type"] = list_type with hf.element("ol" if ordered else "ul", attrib=attrib): for el in input_objects: diff --git a/pyglossary/plugins/wordset.py b/pyglossary/plugins/wordset.py index e764ccb79..f2a0ce8b8 100644 --- a/pyglossary/plugins/wordset.py +++ b/pyglossary/plugins/wordset.py @@ -11,10 +11,10 @@ EncodingOption, Option, ) -from pyglossary.sort_keys import lookupSortKey if TYPE_CHECKING: from collections.abc import Iterator + from typing import Any from pyglossary.glossary_types import EntryType, GlossaryType @@ -98,6 +98,10 @@ def fileNameSortKey(fname: str) -> str: return "\x80" return fname + @staticmethod + def sortKey(word: str) -> Any: + return word.lower().encode("utf-8", errors="replace") + def __iter__(self) -> Iterator[EntryType]: if not self._filename: raise RuntimeError("iterating over a reader while it's not open") @@ -105,19 +109,14 @@ def __iter__(self) -> Iterator[EntryType]: direc = self._filename encoding = self._encoding glos = self._glos + for fname in sorted(listdir(direc), key=self.fileNameSortKey): fpath = join(direc, fname) if not (fname.endswith(".json") and isfile(fpath)): continue with open(fpath, encoding=encoding) as fileObj: - data = load(fileObj) - words = list(data) - namedSortKey = lookupSortKey("headword_lower") - if namedSortKey is None: - raise RuntimeError("namedSortKey is None") - sortKey = namedSortKey.normal("utf-8") - words.sort(key=sortKey) - for word in words: + data: dict[str, dict[str, Any]] = load(fileObj) + for word in sorted(data, key=self.sortKey): entryDict = data[word] defi = "".join( self.defiTemplate.format( diff --git a/pyglossary/sq_entry_list.py 
b/pyglossary/sq_entry_list.py index cd3402b7e..c6d4381dc 100644 --- a/pyglossary/sq_entry_list.py +++ b/pyglossary/sq_entry_list.py @@ -29,6 +29,7 @@ from .glossary_types import EntryType, RawEntryType from .sort_keys import NamedSortKey + from .sort_keys_types import SQLiteSortKeyType __all__ = ["SqEntryList"] @@ -62,7 +63,7 @@ def __init__( # noqa: PLR0913 self._reverse = False self._len = 0 self._create = create - self._sqliteSortKey = None + self._sqliteSortKey: SQLiteSortKeyType = [] self._columnNames = "" def hasSortKey(self) -> bool: @@ -78,7 +79,7 @@ def setSortKey( if self._con is None: raise RuntimeError("self._con is None") - if self._sqliteSortKey is not None: + if self._sqliteSortKey: raise RuntimeError("Called setSortKey twice") if namedSortKey.sqlite is None: @@ -116,7 +117,7 @@ def _decode(self, data: bytes) -> EntryType: return self._entryFromRaw(data.split(b"\x00")) def append(self, entry: EntryType) -> None: - self._cur.execute( + self._cur.execute( # type: ignore f"insert into data({self._columnNames}, data)" f" values (?{', ?' 
* len(self._sqliteSortKey)})", [col[2](entry.l_word) for col in self._sqliteSortKey] @@ -139,8 +140,8 @@ def __iadd__(self, other: Iterable) -> SqEntryList: def sort(self, reverse: bool = False) -> None: if self._sorted: raise NotImplementedError("can not sort more than once") - if self._sqliteSortKey is None: - raise RuntimeError("self._sqliteSortKey is None") + if not self._sqliteSortKey: + raise RuntimeError("self._sqliteSortKey is empty") self._reverse = reverse self._sorted = True @@ -148,6 +149,7 @@ def sort(self, reverse: bool = False) -> None: self._orderBy = sortColumnNames if reverse: self._orderBy = ",".join(f"{col[0]} DESC" for col in self._sqliteSortKey) + assert self._con self._con.commit() self._con.execute( f"CREATE INDEX sortkey ON data({sortColumnNames});", diff --git a/pyglossary/text_reader.py b/pyglossary/text_reader.py index 249212e1b..1f63c3b7d 100644 --- a/pyglossary/text_reader.py +++ b/pyglossary/text_reader.py @@ -123,7 +123,7 @@ def _openGen(self, filename: str) -> Iterator[tuple[int, int]]: else: log.warning("TextGlossaryReader: file is not seekable") - self._progress = self._glos.progressbar and self._fileSize + self._progress = self._glos.progressbar and self._fileSize > 0 self._file = TextFilePosWrapper(cfile, self._encoding) if self._hasInfo: diff --git a/pyglossary/text_utils.py b/pyglossary/text_utils.py index 816dbc44e..3ef25fe3e 100644 --- a/pyglossary/text_utils.py +++ b/pyglossary/text_utils.py @@ -57,7 +57,7 @@ def toStr(s: AnyStr) -> str: def fixUtf8(st: AnyStr) -> str: if isinstance(st, str): - st = bytes(st, "utf-8") + return st.encode("utf-8").replace(b"\x00", b"").decode("utf-8", "replace") return st.replace(b"\x00", b"").decode("utf-8", "replace") diff --git a/pyglossary/ui/base.py b/pyglossary/ui/base.py index 3126487b9..090e68d67 100644 --- a/pyglossary/ui/base.py +++ b/pyglossary/ui/base.py @@ -21,7 +21,6 @@ from __future__ import annotations import logging -from collections import OrderedDict from os.path import 
isfile, join from pyglossary.core import ( @@ -65,7 +64,7 @@ def fread(path: str) -> str: } -def getEntryFilterConfigPair(name: str) -> tuple[str, Option]: +def getEntryFilterOption(name: str) -> Option: filterClass, default = _entryFilterConfigDict[name] if isinstance(default, bool): optClass = BoolOption @@ -73,7 +72,7 @@ def getEntryFilterConfigPair(name: str) -> tuple[str, Option]: optClass = StrOption else: raise TypeError(f"{default = }") - return name, optClass( + return optClass( hasFlag=True, comment=filterClass.desc, falseComment=filterClass.falseComment, @@ -81,120 +80,83 @@ def getEntryFilterConfigPair(name: str) -> tuple[str, Option]: class UIBase: - configDefDict: dict[str, Option] = OrderedDict( - [ - ( - "log_time", - BoolOption( - hasFlag=True, - comment="Show date and time in logs", - falseComment="Do not show date and time in logs", - ), + configDefDict: dict[str, Option] = { + "log_time": BoolOption( + hasFlag=True, + comment="Show date and time in logs", + falseComment="Do not show date and time in logs", + ), + "cleanup": BoolOption( + hasFlag=True, + comment="Cleanup cache or temporary files after conversion", + falseComment=("Do not cleanup cache or temporary files after conversion",), + ), + "auto_sqlite": BoolOption( + hasFlag=False, + comment=( + "Auto-enable --sqlite to limit RAM usage when direct\n" + "mode is not possible. Can override with --no-sqlite" ), - ( - "cleanup", - BoolOption( - hasFlag=True, - comment="Cleanup cache or temporary files after conversion", - falseComment=( - "Do not cleanup cache or temporary files after conversion", - ), - ), - ), - ( - "auto_sqlite", - BoolOption( - hasFlag=False, - comment=( - "Auto-enable --sqlite to limit RAM usage when direct\n" - "mode is not possible. 
Can override with --no-sqlite" - ), - ), - ), - ( - "enable_alts", - BoolOption( - hasFlag=True, - customFlag="alts", - comment="Enable alternates", - falseComment="Disable alternates", - ), - ), - # FIXME: replace with "resources" - # comment="Use resources (images, audio, etc)" - ( - "skip_resources", - BoolOption( - hasFlag=True, - comment="Skip resources (images, audio, css, etc)", - ), - ), - ( - "save_info_json", - BoolOption( - hasFlag=True, - customFlag="info", - comment="Save .info file alongside output file(s)", - ), - ), - getEntryFilterConfigPair("lower"), - getEntryFilterConfigPair("utf8_check"), - getEntryFilterConfigPair("rtl"), - getEntryFilterConfigPair("remove_html"), - getEntryFilterConfigPair("remove_html_all"), - getEntryFilterConfigPair("normalize_html"), - getEntryFilterConfigPair("skip_duplicate_headword"), - getEntryFilterConfigPair("trim_arabic_diacritics"), - getEntryFilterConfigPair("unescape_word_links"), - ( - "color.enable.cmd.unix", - BoolOption( - hasFlag=False, - comment="Enable colors in Linux/Unix command line", - ), - ), - ( - "color.enable.cmd.windows", - BoolOption( - hasFlag=False, - comment="Enable colors in Windows command line", - ), - ), - ( - "color.cmd.critical", - IntOption( - hasFlag=False, - comment="Color code for critical errors in command line", - ), - ), - ( - "color.cmd.error", - IntOption( - hasFlag=False, - comment="Color code for errors in command line", - ), - ), - ( - "color.cmd.warning", - IntOption( - hasFlag=False, - comment="Color code for warnings in command line", - ), - ), - # interactive command line interface - ("cmdi.prompt.indent.str", StrOption(hasFlag=False)), - ("cmdi.prompt.indent.color", IntOption(hasFlag=False)), - ("cmdi.prompt.msg.color", IntOption(hasFlag=False)), - ("cmdi.msg.color", IntOption(hasFlag=False)), - ("ui_autoSetFormat", BoolOption(hasFlag=False)), - ("reverse_matchWord", BoolOption(hasFlag=False)), - ("reverse_showRel", StrOption(hasFlag=False)), - ("reverse_saveStep", 
IntOption(hasFlag=False)), - ("reverse_minRel", FloatOption(hasFlag=False)), - ("reverse_maxNum", IntOption(hasFlag=False)), - ("reverse_includeDefs", BoolOption(hasFlag=False)), - ], - ) + ), + "enable_alts": BoolOption( + hasFlag=True, + customFlag="alts", + comment="Enable alternates", + falseComment="Disable alternates", + ), + # FIXME: replace with "resources" + # comment="Use resources (images, audio, etc)" + "skip_resources": BoolOption( + hasFlag=True, + comment="Skip resources (images, audio, css, etc)", + ), + "save_info_json": BoolOption( + hasFlag=True, + customFlag="info", + comment="Save .info file alongside output file(s)", + ), + "lower": getEntryFilterOption("lower"), + "utf8_check": getEntryFilterOption("utf8_check"), + "rtl": getEntryFilterOption("rtl"), + "remove_html": getEntryFilterOption("remove_html"), + "remove_html_all": getEntryFilterOption("remove_html_all"), + "normalize_html": getEntryFilterOption("normalize_html"), + "skip_duplicate_headword": getEntryFilterOption("skip_duplicate_headword"), + "trim_arabic_diacritics": getEntryFilterOption("trim_arabic_diacritics"), + "unescape_word_links": getEntryFilterOption("unescape_word_links"), + "color.enable.cmd.unix": BoolOption( + hasFlag=False, + comment="Enable colors in Linux/Unix command line", + ), + "color.enable.cmd.windows": BoolOption( + hasFlag=False, + comment="Enable colors in Windows command line", + ), + "color.cmd.critical": IntOption( + hasFlag=False, + comment="Color code for critical errors in command line", + ), + "color.cmd.error": IntOption( + hasFlag=False, + comment="Color code for errors in command line", + ), + "color.cmd.warning": IntOption( + hasFlag=False, + comment="Color code for warnings in command line", + ), + # interactive command line interface: + "cmdi.prompt.indent.str": StrOption(hasFlag=False), + "cmdi.prompt.indent.color": IntOption(hasFlag=False), + "cmdi.prompt.msg.color": IntOption(hasFlag=False), + "cmdi.msg.color": IntOption(hasFlag=False), + 
"ui_autoSetFormat": BoolOption(hasFlag=False), + "reverse_matchWord": BoolOption(hasFlag=False), + "reverse_showRel": StrOption(hasFlag=False), + "reverse_saveStep": IntOption(hasFlag=False), + "reverse_minRel": FloatOption(hasFlag=False), + "reverse_maxNum": IntOption(hasFlag=False), + "reverse_includeDefs": BoolOption(hasFlag=False), + } conflictingParams = [ ("sqlite", "direct"), @@ -254,7 +216,7 @@ def loadConfig( def saveConfig(self) -> None: from pyglossary.json_utils import dataToPrettyJson - config = OrderedDict() + config = {} for key, option in self.configDefDict.items(): if key not in self.config: log.warning(f"saveConfig: missing key {key!r}") diff --git a/pyglossary/ui/runner.py b/pyglossary/ui/runner.py index ce1365e01..06554b793 100644 --- a/pyglossary/ui/runner.py +++ b/pyglossary/ui/runner.py @@ -12,13 +12,14 @@ import argparse import logging from collections.abc import Callable + from typing import Any ui_list = ["gtk", "gtk4", "tk", "web"] if os.sep == "\\": ui_list = ["tk", "gtk", "gtk4", "web"] -log = None +log: logging.Logger | None = None def canRunGUI() -> bool: @@ -48,14 +49,15 @@ def base_ui_run( # noqa: PLR0913 inputFormat: str = "", outputFormat: str = "", reverse: bool = False, - config: dict | None = None, - readOptions: dict | None = None, - writeOptions: dict | None = None, - convertOptions: dict | None = None, - glossarySetAttrs: dict | None = None, + config: dict[str, Any] | None = None, + readOptions: dict[str, Any] | None = None, + writeOptions: dict[str, Any] | None = None, + convertOptions: dict[str, Any] | None = None, + glossarySetAttrs: dict[str, Any] | None = None, ) -> bool: from pyglossary.glossary_v2 import ConvertArgs, Glossary + assert log if reverse: log.error("--reverse does not work with --ui=none") return False diff --git a/pyglossary/ui/tools/view_glossary.py b/pyglossary/ui/tools/view_glossary.py index 5c1b90133..349c45d8c 100755 --- a/pyglossary/ui/tools/view_glossary.py +++ 
b/pyglossary/ui/tools/view_glossary.py @@ -2,6 +2,7 @@ # mypy: ignore-errors from __future__ import annotations +import argparse import os.path import shlex import sys @@ -59,6 +60,7 @@ def viewGlossary( filename: str, formatName: str | None = None, glos: GlossaryType | None = None, + noRes: bool = False, ) -> None: highlightEntry = getEntryHighlighter() @@ -81,6 +83,8 @@ def viewGlossary( def handleEntry(entry: EntryType) -> None: nonlocal index + if noRes and entry.isData(): + return if highlightEntry: highlightEntry(entry) entryStr = ( @@ -110,12 +114,37 @@ def handleEntry(entry: EntryType) -> None: def main() -> None: - filename = sys.argv[1] - formatName = None - if len(sys.argv) > 2: - formatName = sys.argv[2] - filename = os.path.expanduser(filename) - viewGlossary(filename, formatName=formatName) + parser = argparse.ArgumentParser( + prog=sys.argv[0], + add_help=True, + # allow_abbrev=False, + ) + parser.add_argument( + "--format", + dest="formatName", + default=None, + help="format name", + ) + parser.add_argument( + "--no-res", + dest="noRes", + action="store_true", + default=False, + help="do not automatically show resources / files", + ) + parser.add_argument( + "filename", + action="store", + default="", + nargs=1, + ) + args = parser.parse_args() + + viewGlossary( + os.path.expanduser(args.filename[0]), + formatName=args.formatName, + noRes=args.noRes, + ) if __name__ == "__main__": diff --git a/pyglossary/ui/ui_cmd_interactive.py b/pyglossary/ui/ui_cmd_interactive.py index 48f93cd91..05250fb02 100644 --- a/pyglossary/ui/ui_cmd_interactive.py +++ b/pyglossary/ui/ui_cmd_interactive.py @@ -47,7 +47,6 @@ import logging import os import shlex -from collections import OrderedDict from os.path import ( abspath, dirname, @@ -304,31 +303,27 @@ def __init__( " -l, --long use a long listing format\n" ) - self._fsActions = OrderedDict( - [ - ("!pwd", (self.fs_pwd, "")), - ("!ls", (self.fs_ls, self.ls_usage)), - ("!..", (self.fs_cd_parent, "")), - ("!cd", 
(self.fs_cd, "")), - ], - ) - self._finalActions = OrderedDict( - [ - ("formats", self.askFormats), - ("read-options", self.askReadOptions), - ("write-options", self.askWriteOptions), - ("reset-read-options", self.resetReadOptions), - ("reset-write-options", self.resetWriteOptions), - ("config", self.askConfig), - ("indirect", self.setIndirect), - ("sqlite", self.setSQLite), - ("no-progressbar", self.setNoProgressbar), - ("sort", self.setSort), - ("sort-key", self.setSortKey), - ("show-options", self.showOptions), - ("back", None), - ], - ) + self._fsActions = { + "!pwd": (self.fs_pwd, ""), + "!ls": (self.fs_ls, self.ls_usage), + "!..": (self.fs_cd_parent, ""), + "!cd": (self.fs_cd, ""), + } + self._finalActions = { + "formats": self.askFormats, + "read-options": self.askReadOptions, + "write-options": self.askWriteOptions, + "reset-read-options": self.resetReadOptions, + "reset-write-options": self.resetWriteOptions, + "config": self.askConfig, + "indirect": self.setIndirect, + "sqlite": self.setSQLite, + "no-progressbar": self.setNoProgressbar, + "sort": self.setSort, + "sort-key": self.setSortKey, + "show-options": self.showOptions, + "back": None, + } @staticmethod def fs_pwd(args: list[str]): diff --git a/pyglossary/ui/ui_gtk.py b/pyglossary/ui/ui_gtk.py index 05ba5a97b..9632ce23d 100644 --- a/pyglossary/ui/ui_gtk.py +++ b/pyglossary/ui/ui_gtk.py @@ -22,7 +22,6 @@ import logging import sys import traceback -from collections import OrderedDict from os.path import abspath, isfile from typing import TYPE_CHECKING, Any @@ -953,17 +952,15 @@ def __init__(self, ui, **kwargs) -> None: pack(hbox, self.sqliteCheck, 0, 0, padding=hpad) pack(self.vbox, hbox, 0, 0, padding=vpad) ## - self.configParams = OrderedDict( - [ - ("save_info_json", False), - ("lower", False), - ("skip_resources", False), - ("rtl", False), - ("enable_alts", True), - ("cleanup", True), - ("remove_html_all", True), - ], - ) + self.configParams = { + "save_info_json": False, + "lower": False, + 
"skip_resources": False, + "rtl": False, + "enable_alts": True, + "cleanup": True, + "remove_html_all": True, + } self.configCheckButtons = {} configDefDict = UIBase.configDefDict for param in self.configParams: diff --git a/pyglossary/ui/ui_gtk4.py b/pyglossary/ui/ui_gtk4.py index 81d62156b..570ba5551 100644 --- a/pyglossary/ui/ui_gtk4.py +++ b/pyglossary/ui/ui_gtk4.py @@ -22,7 +22,6 @@ import logging import sys import traceback -from collections import OrderedDict from os.path import abspath, isfile from typing import TYPE_CHECKING, Any @@ -981,17 +980,15 @@ def __init__(self, mainWin, **kwargs) -> None: pack(hbox, self.sqliteCheck) pack(self.vbox, hbox) ## - self.configParams = OrderedDict( - [ - ("save_info_json", False), - ("lower", False), - ("skip_resources", False), - ("rtl", False), - ("enable_alts", True), - ("cleanup", True), - ("remove_html_all", True), - ], - ) + self.configParams = { + "save_info_json": False, + "lower": False, + "skip_resources": False, + "rtl": False, + "enable_alts": True, + "cleanup": True, + "remove_html_all": True, + } self.configCheckButtons = {} configDefDict = UIBase.configDefDict for param in self.configParams: diff --git a/pyglossary/xdxf/transform.py b/pyglossary/xdxf/transform.py index 9f51816de..0bbf59575 100644 --- a/pyglossary/xdxf/transform.py +++ b/pyglossary/xdxf/transform.py @@ -143,6 +143,12 @@ def addSep() -> None: addSep() def _write_example(self, hf: T_htmlfile, elem: Element) -> None: + children = elem.xpath("child::node()") + if not children: + return + if not isinstance(children, list): + log.warning(f"unexpected {children=}") + return prev = None stringSep = " " with hf.element( @@ -152,12 +158,18 @@ def _write_example(self, hf: T_htmlfile, elem: Element) -> None: "style": f"padding: {self._example_padding}px 0px;", }, ): - for child in elem.xpath("child::node()"): + for child in children: if isinstance(child, str): # if not child.strip(): # continue self.writeString(hf, child, elem, prev, 
stringSep=stringSep) continue + if isinstance(child, bytes | tuple): + # TODO + log.warning(f"unexpected {child=}") + continue + if not child: + continue if child.tag == "iref": with hf.element("div"): self._write_iref(hf, child) # NESTED 5 @@ -404,10 +416,19 @@ def writeChildrenOf( sep: str | None = None, stringSep: str | None = None, ) -> None: + children = elem.xpath("child::node()") + if not children: + return + if not isinstance(children, list): + log.warning(f"unexpceted {children=}") + return prev = None - for child in elem.xpath("child::node()"): + for child in children: if sep and prev is not None and self.shouldAddSep(child, prev): hf.write(sep) + if isinstance(child, bytes | tuple): + log.warning(f"unexpected {child=}") + continue self.writeChild(hf, child, elem, prev, stringSep=stringSep) prev = child diff --git a/pyproject.toml b/pyproject.toml index 8c5527d11..9b6f4059c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -186,9 +186,6 @@ mccabe.max-complexity = 13 # Unlike Flake8, default to a complexity level of 10. "zimfile.py" = [ "C901", # `...` is too complex ] -"iupac_goldbook.py" = [ - "C901", # `...` is too complex -] "pyglossary/plugins/babylon_bgl/*.py" = [ "C901", # `...` is too complex ] @@ -406,7 +403,7 @@ build-backend = "setuptools.build_meta" [project] name = "pyglossary" -version = "5.0.0" +version = "5.0.1" description = "A tool for converting dictionary files aka glossaries." 
readme = "README.md" authors = [{ name = "Saeed Rasooli", email = "saeed.gnu@gmail.com" }] diff --git a/scripts/plugin-doc.py b/scripts/plugin-doc.py index 23db48278..b2d86a2e5 100755 --- a/scripts/plugin-doc.py +++ b/scripts/plugin-doc.py @@ -1,7 +1,6 @@ #!/usr/bin/python3 import sys -from collections import OrderedDict from os.path import abspath, dirname, join from pathlib import Path @@ -236,7 +235,7 @@ def getToolSourceLink(tool): toolsFile = join(toolsDir, f"{p.lname}.toml") try: with open(toolsFile, encoding="utf-8") as _file: - tools_toml = toml.load(_file, _dict=OrderedDict) + tools_toml = toml.load(_file) except FileNotFoundError: tools = [] except Exception as e: diff --git a/scripts/plugin-index.py b/scripts/plugin-index.py index 14c90e5aa..9f342d86e 100755 --- a/scripts/plugin-index.py +++ b/scripts/plugin-index.py @@ -2,9 +2,9 @@ import json import sys -from collections import OrderedDict as odict from os.path import abspath, dirname, join from pathlib import Path +from typing import Any rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) @@ -27,22 +27,17 @@ for p in plugins: canRead = p.canRead canWrite = p.canWrite - item = odict( - [ - ("module", p.module.__name__), - ("lname", p.lname), - ("name", p.name), - ("description", p.description), - ("extensions", p.extensions), - ("singleFile", p.singleFile), - ( - "optionsProp", - {name: opt.toDict() for name, opt in p.optionsProp.items()}, - ), - ("canRead", canRead), - ("canWrite", canWrite), - ], - ) + item: dict[str, Any] = { + "module": p.module.__name__, + "lname": p.lname, + "name": p.name, + "description": p.description, + "extensions": p.extensions, + "singleFile": p.singleFile, + "optionsProp": {name: opt.toDict() for name, opt in p.optionsProp.items()}, + "canRead": canRead, + "canWrite": canWrite, + } if p.sortOnWrite != DEFAULT_NO: item["sortOnWrite"] = p.sortOnWrite if p.sortKeyName: diff --git a/scripts/test-cover-html-plugin.sh b/scripts/test-cover-html-plugin.sh 
index 4e234cab3..353f07671 100755 --- a/scripts/test-cover-html-plugin.sh +++ b/scripts/test-cover-html-plugin.sh @@ -1,11 +1,49 @@ #!/usr/bin/env bash set -e -# l_name of plugin, for example "stardict" or "octopus_mdict" -pluginLname="$1" +pluginLookup="$1" +if [ -z "$pluginLookup" ]; then + echo 'Must give plugins l_name as argument, for example "stardict" or "octopus_mdict"' + exit 1 +fi -rootDir=$(dirname $(dirname "$0")) +rootDir=$(dirname $(realpath $(dirname "$0"))) +echo $rootDir + +cd $rootDir/pyglossary/plugins/ +pluginLname=$(ls -1d $pluginLookup* | grep -v 'cover' | sort | head -n1 | sed 's/\.py$//') +if [ -z "$pluginLname" ]; then + echo "Did not find a plugin matching '$pluginLookup'" + exit 1 +fi + +if [ -f "$rootDir/pyglossary/plugins/${pluginLname}.py" ]; then + filePaths="$rootDir/pyglossary/plugins/${pluginLname}.py" +elif [ -d "$rootDir/pyglossary/plugins/${pluginLname}" ]; then + filePaths="$rootDir/pyglossary/plugins/${pluginLname}/*.py" +else + echo "Did not find a plugin matching '$pluginLookup'" + exit 1 +fi + +echo "Using plugin name '$pluginLname'" + +dataFile="$rootDir/pyglossary/plugins/${pluginLname}.cover" + +outDir="$rootDir/pyglossary/plugins/${pluginLname}.coverhtml" +mkdir -p $outDir +# echo "file://$outDir/index.html" cd "$rootDir/tests" -coverage run -m unittest "g_${pluginLname}_test.py" -coverage html --include="$rootDir/pyglossary/plugins/${pluginLname}*" + +set -x +coverage run --data-file="$dataFile" -m unittest "g_${pluginLname}_test.py" +coverage html --data-file="$dataFile" \ + --include="$filePaths" \ + --directory=$outDir || + echo "'coverage html' failed with $?" 
+set +x + +if [ -f "$outDir/index.html" ]; then + echo "file://$outDir/index.html" +fi diff --git a/scripts/test-cover-html.sh b/scripts/test-cover-html.sh index 43e3b6fac..cfba0315a 100755 --- a/scripts/test-cover-html.sh +++ b/scripts/test-cover-html.sh @@ -1,10 +1,11 @@ #!/usr/bin/env bash set -e -rootDir=$(dirname $(dirname "$0")) +rootDir=$(dirname $(realpath $(dirname "$0"))) +echo "file://$rootDir/tests/htmlcov/index.html" cd "$rootDir/tests" coverage run -m unittest ./*_test.py -coverage html --include="$rootDir/pyglossary/*" --omit="$rootDir/pyglossary/plugin_lib/*" +coverage html --include="$rootDir/pyglossary/*" --omit="$rootDir/pyglossary/plugin_lib/*" || echo "'coverage html' failed with $?" echo "file://$rootDir/tests/htmlcov/index.html" diff --git a/scripts/test-deps.sh b/scripts/test-deps.sh index 152e26181..272d3131c 100755 --- a/scripts/test-deps.sh +++ b/scripts/test-deps.sh @@ -5,4 +5,5 @@ python -m pip install \ python-idzip \ lxml==5.3 \ marisa-trie \ - mistune + mistune \ + polib diff --git a/scripts/tools-py2toml.py b/scripts/tools-py2toml.py index 720ef83bb..bdcf07d43 100755 --- a/scripts/tools-py2toml.py +++ b/scripts/tools-py2toml.py @@ -1,7 +1,6 @@ #!/usr/bin/python3 import sys -from collections import OrderedDict from os.path import abspath, dirname, join from pathlib import Path @@ -28,7 +27,7 @@ module = p.module optionsProp = p.optionsProp - tools = OrderedDict() + tools = {} for tool in getattr(p.module, "tools", []): tools[tool.pop("name")] = tool diff --git a/scripts/type-checker-deps.sh b/scripts/type-checker-deps.sh new file mode 100644 index 000000000..108015095 --- /dev/null +++ b/scripts/type-checker-deps.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +python3 -m pip install lxml-stubs types-beautifulsoup4 types-psutil types-polib + diff --git a/scripts/wiktextract/extract-schema.py b/scripts/wiktextract/extract-schema.py index 644167bc7..fc3b8c057 100644 --- a/scripts/wiktextract/extract-schema.py +++ 
b/scripts/wiktextract/extract-schema.py @@ -1,7 +1,6 @@ import json import sys from collections import Counter -from collections import OrderedDict as odict from dataclasses import dataclass from typing import Any @@ -30,7 +29,7 @@ def __dict__(self): else: keys.insert(0, "word") return { - "__dict__": odict((key, self.Dict[key].__dict__) for key in keys), + "__dict__": {key: self.Dict[key].__dict__ for key in keys}, # "__key_score__": self.keyScoreList(), } diff --git a/setup.py b/setup.py index 742329690..536b1ac0b 100755 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ from setuptools import setup from setuptools.command.install import install -VERSION = "5.0.0" +VERSION = "5.0.1" log = logging.getLogger("root") relRootDir = "share/pyglossary" diff --git a/tests/g_csv_test.py b/tests/g_csv_plugin_test.py similarity index 100% rename from tests/g_csv_test.py rename to tests/g_csv_plugin_test.py diff --git a/tests/g_gettext_po_test.py b/tests/g_gettext_po_test.py new file mode 100644 index 000000000..24930131f --- /dev/null +++ b/tests/g_gettext_po_test.py @@ -0,0 +1,51 @@ +import sys +import unittest +from os.path import abspath, dirname + +rootDir = dirname(dirname(abspath(__file__))) +sys.path.insert(0, rootDir) + +from glossary_v2_test import TestGlossaryBase + + +class TestGlossaryGetttestPo(TestGlossaryBase): + def __init__(self, *args, **kwargs): + TestGlossaryBase.__init__(self, *args, **kwargs) + + self.dataFileCRC32.update( + { + "100-en-fa.po": "694de186", + "100-en-fa.po.txt": "f0c3ea53", + }, + ) + + def convert_txt_po(self, fname, fname2, **convertArgs): + self.convert( + f"{fname}.txt", + f"{fname}-2.po", + compareText=f"{fname2}.po", + **convertArgs, + ) + + def convert_po_txt(self, fname, fname2, **convertArgs): + self.convert( + f"{fname}.po", + f"{fname}-2.txt", + compareText=f"{fname2}.txt", + **convertArgs, + ) + + def test_convert_txt_po_1(self): + self.convert_txt_po("100-en-fa", "100-en-fa") + + # TODO + def test_convert_po_txt_1(self): + 
self.convert_po_txt( + "100-en-fa", + "100-en-fa.po", + infoOverride={"input_file_size": None}, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/g_stardict_merge_syns_test.py b/tests/g_stardict_merge_syns_test.py index ae7250ca6..2017173e5 100644 --- a/tests/g_stardict_merge_syns_test.py +++ b/tests/g_stardict_merge_syns_test.py @@ -5,155 +5,71 @@ rootDir = dirname(dirname(abspath(__file__))) sys.path.insert(0, rootDir) -from glossary_v2_errors_test import TestGlossaryErrorsBase -from pyglossary.glossary_v2 import ConvertArgs, Glossary +from g_stardict_test import TestGlossaryStarDictBase +from glossary_v2_errors_test import TestGlossaryErrorsBase __all__ = ["TestGlossaryStarDictMergeSyns"] -class TestGlossaryStarDictMergeSynsBase(TestGlossaryErrorsBase): - def convert_txt_stardict( # noqa: PLR0913 - self, - fname, - fname2="", - syn=True, - dictzip=False, - config=None, - writeOptions=None, - info=None, - **convertArgs, - ): - if not fname2: - fname2 = fname - - binExtList = ["idx", "dict"] - if syn: - binExtList.append("syn") - - inputFilename = self.downloadFile(f"{fname}.txt") - outputFilename = self.newTempFilePath(f"{fname}.ifo") - otherFiles = {ext: self.newTempFilePath(f"{fname}.{ext}") for ext in binExtList} - - glos = self.glos = Glossary() - if info: - for key, value in info.items(): - glos.setInfo(key, value) - - if config is not None: - glos.config = config - - if writeOptions is None: - writeOptions = {} - writeOptions["dictzip"] = dictzip - - result = glos.convert( - ConvertArgs( - inputFilename=inputFilename, - outputFilename=outputFilename, - writeOptions=writeOptions, - outputFormat="StardictMergeSyns", - **convertArgs, - ) - ) - self.assertEqual(outputFilename, result) - - self.compareTextFiles( - outputFilename, - self.downloadFile(f"{fname2}.sd/{fname2}.ifo"), - ) - - for ext in binExtList: - self.compareBinaryFiles( - otherFiles[ext], - self.downloadFile(f"{fname2}.sd/{fname2}.{ext}"), - ) - - def 
convert_txt_stardict_zip( # noqa: PLR0913 - self, - fname, - sha1sumDict, - dictzip=False, - config=None, - **convertArgs, - ): - inputFilename = self.downloadFile(f"{fname}.txt") - outputFilename = self.newTempFilePath(f"{fname}.zip") - - glos = self.glos = Glossary() - - if config is not None: - glos.config = config - - result = glos.convert( - ConvertArgs( - inputFilename=inputFilename, - outputFilename=outputFilename, - outputFormat="StardictMergeSyns", - writeOptions={ - "dictzip": dictzip, - }, - **convertArgs, - ) - ) - self.assertEqual(outputFilename, result) - - self.checkZipFileSha1sum( - outputFilename, - sha1sumDict=sha1sumDict, - ) - +class TestGlossaryStarDictMergeSyns(TestGlossaryStarDictBase): + def convert_txt_stardict(self, *args, **kwargs): + kwargs["outputFormat"] = "StardictMergeSyns" + TestGlossaryStarDictBase.convert_txt_stardict(self, *args, **kwargs) -class TestGlossaryStarDictMergeSyns(TestGlossaryStarDictMergeSynsBase): def __init__(self, *args, **kwargs): TestGlossaryErrorsBase.__init__(self, *args, **kwargs) self.dataFileCRC32.update( { - "100-en-de-v4.sd/100-en-de-v4.dict": "5a97476f", - "100-en-de-v4.sd/100-en-de-v4.idx": "a99f29d2", - "100-en-de-v4.sd/100-en-de-v4.ifo": "6529871f", - "100-en-fa-merge-syns.sd/100-en-fa-merge-syns.dict": "223a0d1d", - "100-en-fa-merge-syns.sd/100-en-fa-merge-syns.idx": "13f1c7af", - "100-en-fa-merge-syns.sd/100-en-fa-merge-syns.ifo": "07338eed", "002-plain-html.txt": "75484314", - "002-plain-html.sd/002-plain-html.dict": "2e9d20d8", - "002-plain-html.sd/002-plain-html.idx": "3956ad72", - "002-plain-html.sd/002-plain-html.ifo": "1991f125", "004-plain-html-alts.txt": "505d4675", - "004-plain-html-alts-merge-syns.sd/" - "004-plain-html-alts-merge-syns.dict": "889f11f8", - "004-plain-html-alts-merge-syns.sd/" - "004-plain-html-alts-merge-syns.idx": "092ba555", - "004-plain-html-alts-merge-syns.sd/" - "004-plain-html-alts-merge-syns.ifo": "628abe99", + "002-plain-html-sd-merge-syns-v2/002-plain-html.dict": 
"2e9d20d8", + "002-plain-html-sd-merge-syns-v2/002-plain-html.idx": "3956ad72", + "002-plain-html-sd-merge-syns-v2/002-plain-html.ifo": "1991f125", + "004-plain-html-alts-sd-merge-syns-v2/004-plain-html-alts.dict": "889f11f8", + "004-plain-html-alts-sd-merge-syns-v2/004-plain-html-alts.idx": "092ba555", + "004-plain-html-alts-sd-merge-syns-v2/004-plain-html-alts.ifo": "628abe99", + "004-plain-html-alts-sd-merge-syns-v2/004-plain-html-alts.syn": "c07f7111", + "100-en-de-v4-sd-merge-syns-v2/100-en-de-v4.dict": "5a97476f", + "100-en-de-v4-sd-merge-syns-v2/100-en-de-v4.idx": "a99f29d2", + "100-en-de-v4-sd-merge-syns-v2/100-en-de-v4.ifo": "2120708c", + "100-en-fa-sd-merge-syns-v2/100-en-fa.dict": "223a0d1d", + "100-en-fa-sd-merge-syns-v2/100-en-fa.idx": "13f1c7af", + "100-en-fa-sd-merge-syns-v2/100-en-fa.ifo": "248ef828", }, ) def test_convert_txt_stardict_1_merge_syns(self): self.convert_txt_stardict( "100-en-fa", - fname2="100-en-fa-merge-syns", + "100-en-fa-sd-merge-syns-v2", syn=False, + # dictzip=False, ) def test_convert_txt_stardict_3_merge_syns(self): self.convert_txt_stardict( "100-en-de-v4", + "100-en-de-v4-sd-merge-syns-v2", syn=False, + # dictzip=False, ) def test_convert_txt_stardict_general_1_merge_syns(self): self.convert_txt_stardict( "002-plain-html", + "002-plain-html-sd-merge-syns-v2", syn=False, + # dictzip=False, ) def test_convert_txt_stardict_general_2_merge_syns(self): self.convert_txt_stardict( "004-plain-html-alts", - fname2="004-plain-html-alts-merge-syns", + "004-plain-html-alts-sd-merge-syns-v2", syn=False, + # dictzip=False, ) diff --git a/tests/g_stardict_sort_test.py b/tests/g_stardict_sort_test.py index 553936e58..f6916d6a2 100644 --- a/tests/g_stardict_sort_test.py +++ b/tests/g_stardict_sort_test.py @@ -1,3 +1,4 @@ +import os import unittest from g_stardict_test import TestGlossaryStarDictBase @@ -9,10 +10,11 @@ def __init__(self, *args, **kwargs): TestGlossaryErrorsBase.__init__(self, *args, **kwargs) self.dataFileCRC32.update( { - 
"100-en-fa.sd/100-en-fa.dict": "223a0d1d", - "100-en-fa.sd/100-en-fa.idx": "6df43378", - "100-en-fa.sd/100-en-fa.ifo": "3f2086cd", - "100-en-fa.sd/100-en-fa.syn": "1160fa0b", + "100-en-fa-sd-v2/100-en-fa.dict": "223a0d1d", + "100-en-fa-sd-v2/100-en-fa.idx": "6df43378", + "100-en-fa-sd-v2/100-en-fa.ifo": "bb916827", + "100-en-fa-sd-v2/100-en-fa.syn": "1160fa0b", + "100-en-fa-sd-v2.txt": "0b8b2ac0", "100-en-fa-sd.txt": "85f9d3fc", }, ) @@ -24,6 +26,7 @@ def convert_txt_stardict_enfa( ): self.convert_txt_stardict( fname, + fname + "-sd-v2", config={"enable_alts": True}, info={ "sourceLang": "English", @@ -80,6 +83,11 @@ def test_convert_txt_stardict_enfa_4(self): ", and using sortKey function from Stardict plugin", ) + def test_convert_txt_stardict_enfa_5(self): + os.environ["NO_SQLITE"] = "1" + self.convert_txt_stardict_enfa("100-en-fa", sqlite=False) + del os.environ["NO_SQLITE"] + if __name__ == "__main__": unittest.main() diff --git a/tests/g_stardict_test.py b/tests/g_stardict_test.py index 4a9d9a098..f4904643b 100644 --- a/tests/g_stardict_test.py +++ b/tests/g_stardict_test.py @@ -16,7 +16,7 @@ class TestGlossaryStarDictBase(TestGlossaryErrorsBase): def convert_txt_stardict( # noqa: PLR0913 self, fname, - fname2="", + sdDirName, syn=True, dictzip=False, config=None, @@ -24,9 +24,6 @@ def convert_txt_stardict( # noqa: PLR0913 info=None, **convertArgs, ): - if not fname2: - fname2 = fname - binExtList = ["idx", "dict"] if syn: binExtList.append("syn") @@ -59,13 +56,13 @@ def convert_txt_stardict( # noqa: PLR0913 self.compareTextFiles( outputFilename, - self.downloadFile(f"{fname2}.sd/{fname2}.ifo"), + self.downloadFile(f"{sdDirName}/{fname}.ifo"), ) for ext in binExtList: self.compareBinaryFiles( otherFiles[ext], - self.downloadFile(f"{fname2}.sd/{fname2}.{ext}"), + self.downloadFile(f"{sdDirName}/{fname}.{ext}"), ) def convert_txt_stardict_zip( # noqa: PLR0913 @@ -105,6 +102,7 @@ def convert_txt_stardict_zip( # noqa: PLR0913 def convert_stardict_txt( self, 
inputFname: str, + inputDirName: str, outputFname: str, testId: str, syn=True, @@ -114,9 +112,9 @@ def convert_stardict_txt( if syn: binExtList.append("syn") for ext in binExtList: - self.downloadFile(f"{inputFname}.sd/{inputFname}.{ext}") + self.downloadFile(f"{inputDirName}/{inputFname}.{ext}") - inputFilename = self.downloadFile(f"{inputFname}.sd/{inputFname}.ifo") + inputFilename = self.downloadFile(f"{inputDirName}/{inputFname}.ifo") outputFilename = self.newTempFilePath( f"{inputFname}-{testId}.txt", ) @@ -145,14 +143,14 @@ def __init__(self, *args, **kwargs): "004-bar.sd/004-bar.idx": "cf9440cf", "004-bar.sd/004-bar.ifo": "ada870e4", "004-bar.sd/004-bar.syn": "286b17bf", - "100-en-de-v4.sd/100-en-de-v4.dict": "5a97476f", - "100-en-de-v4.sd/100-en-de-v4.idx": "a99f29d2", - "100-en-de-v4.sd/100-en-de-v4.ifo": "6529871f", - "100-en-fa.sd/100-en-fa.dict": "223a0d1d", - "100-en-fa.sd/100-en-fa.idx": "6df43378", - "100-en-fa.sd/100-en-fa.ifo": "3f2086cd", - "100-en-fa.sd/100-en-fa.syn": "1160fa0b", - "100-en-fa-sd.txt": "85f9d3fc", + "100-en-de-v4-sd-v2/100-en-de-v4.dict": "5a97476f", + "100-en-de-v4-sd-v2/100-en-de-v4.idx": "a99f29d2", + "100-en-de-v4-sd-v2/100-en-de-v4.ifo": "2120708c", + "100-en-fa-sd-v2/100-en-fa.dict": "223a0d1d", + "100-en-fa-sd-v2/100-en-fa.idx": "6df43378", + "100-en-fa-sd-v2/100-en-fa.ifo": "bb916827", + "100-en-fa-sd-v2/100-en-fa.syn": "1160fa0b", + "100-en-fa-sd-v2.txt": "0b8b2ac0", # FIXME: remove empty description line from 100-en-fa.ifo # stardict-mixed-types-1.ifo, "stardict-mixed-types-2.ifo "100-ja-en.sd/100-ja-en.dict": "39715f01", @@ -183,6 +181,7 @@ def __init__(self, *args, **kwargs): def test_convert_txt_stardict_0(self): self.convert_txt_stardict( "100-en-fa", + "100-en-fa-sd-v2", config={"auto_sqlite": True}, direct=True, ) @@ -191,6 +190,7 @@ def test_convert_txt_stardict_1(self): for sqlite in (None, False, True): self.convert_txt_stardict( "100-en-fa", + "100-en-fa-sd-v2", sqlite=sqlite, ) @@ -198,7 +198,7 @@ def 
test_convert_txt_stardict_1_zip(self): sha1sumDict = { "100-en-fa.dict": "1e462e829f9e2bf854ceac2ef8bc55911460c79e", "100-en-fa.idx": "943005945b35abf3a3e7b80375c76daa87e810f0", - "100-en-fa.ifo": "3e982a76f83eef66a8d4915e7a0018746f4180bc", + "100-en-fa.ifo": "bf12a932385f54dfcf5ab023d89a8dbd7091e60f", "100-en-fa.syn": "fcefc76628fed18b84b9aa83cd7139721b488545", } for sqlite in (None, False, True): @@ -212,6 +212,7 @@ def test_convert_txt_stardict_2(self): for sqlite in (None, False, True): self.convert_txt_stardict( "004-bar", + "004-bar.sd", sqlite=sqlite, ) @@ -219,6 +220,7 @@ def test_convert_txt_stardict_3(self): for sqlite in (None, False, True): self.convert_txt_stardict( "100-en-de-v4", + "100-en-de-v4-sd-v2", syn=False, sqlite=sqlite, ) @@ -227,6 +229,7 @@ def test_convert_txt_stardict_4(self): for sqlite in (None, False, True): self.convert_txt_stardict( "100-ja-en", + "100-ja-en.sd", syn=True, sqlite=sqlite, ) @@ -235,6 +238,7 @@ def test_convert_txt_stardict_5(self): for sqlite in (None, False, True): self.convert_txt_stardict( "300-ru-en", + "300-ru-en.sd", syn=True, sqlite=sqlite, ) @@ -242,6 +246,7 @@ def test_convert_txt_stardict_5(self): def test_convert_txt_stardict_sqlite_no_alts(self): self.convert_txt_stardict( "100-en-fa", + "100-en-fa-sd-v2", config={"enable_alts": False}, sqlite=True, ) @@ -252,7 +257,8 @@ def test_convert_txt_stardict_sqlite_no_alts(self): def test_convert_stardict_txt_1(self): self.convert_stardict_txt( "100-en-fa", - "100-en-fa-sd", + "100-en-fa-sd-v2", + "100-en-fa-sd-v2", "1", ) @@ -260,6 +266,7 @@ def test_convert_stardict_txt_mixed_types_1(self): self.convert_stardict_txt( "stardict-mixed-types-2", "stardict-mixed-types-2.sd", + "stardict-mixed-types-2.sd", "mixed-types-1", syn=False, ) @@ -268,6 +275,7 @@ def test_convert_stardict_txt_mixed_types_2(self): self.convert_stardict_txt( "stardict-mixed-types-2", "stardict-mixed-types-2.sd", + "stardict-mixed-types-2.sd", "mixed-types-1", syn=False, 
readOptions={"xdxf_to_html": False}, @@ -276,12 +284,14 @@ def test_convert_stardict_txt_mixed_types_2(self): def test_convert_txt_stardict_general_1(self): self.convert_txt_stardict( "002-plain-html", + "002-plain-html.sd", syn=False, ) def test_convert_txt_stardict_general_2(self): self.convert_txt_stardict( "004-plain-html-alts", + "004-plain-html-alts.sd", syn=True, ) diff --git a/tests/g_wiktextract_test.py b/tests/g_wiktextract_test.py new file mode 100644 index 000000000..b64a025f2 --- /dev/null +++ b/tests/g_wiktextract_test.py @@ -0,0 +1,82 @@ +import sys +import unittest +from os.path import abspath, dirname + +rootDir = dirname(dirname(abspath(__file__))) +sys.path.insert(0, rootDir) + +from glossary_v2_test import TestGlossaryBase + + +class TestGlossaryWiktextract(TestGlossaryBase): + def __init__(self, *args, **kwargs): + TestGlossaryBase.__init__(self, *args, **kwargs) + + self.dataFileCRC32.update( + { + "wiktextract/10-kaikki-fa-PlacesInIran.jsonl": "f7f4a92f", + "wiktextract/10-kaikki-fa-PlacesInIran.txt": "29b20845", + "wiktextract/10-kaikki-fa-PlacesInIran-category.txt": "d12fa9c0", + "wiktextract/10-kaikki-fa-pos-adv.jsonl": "2ddcbbbd", + "wiktextract/10-kaikki-fa-pos-adv.txt": "fbaa9972", + "wiktextract/10-kaikki-fa-pos-adv-word_title.txt": "4933de91", + "wiktextract/03-kaikki-fa-selection.jsonl": "31223225", + "wiktextract/03-kaikki-fa-selection.txt": "f54d1a97", + }, + ) + + def convert_jsonl_txt(self, fname, fname2, **convertArgs): + self.convert( + f"wiktextract/{fname}.jsonl", + f"{fname}-2.txt", + compareText=f"wiktextract/{fname2}.txt", + infoOverride={ + # without this, glos name would become f"wiktextract__{fname}.jsonl" + "name": f"{fname}.jsonl", + }, + **convertArgs, + ) + + def test_convert_jsonl_txt_1(self): + self.convert_jsonl_txt( + "10-kaikki-fa-PlacesInIran", + "10-kaikki-fa-PlacesInIran", + ) + + def test_convert_jsonl_txt_1_cats(self): + self.convert_jsonl_txt( + "10-kaikki-fa-PlacesInIran", + 
"10-kaikki-fa-PlacesInIran-category", + readOptions={ + "categories": True, + }, + ) + + def test_convert_jsonl_txt_2(self): + self.convert_jsonl_txt( + "10-kaikki-fa-pos-adv", + "10-kaikki-fa-pos-adv", + ) + + def test_convert_jsonl_txt_2_word_title(self): + self.convert_jsonl_txt( + "10-kaikki-fa-pos-adv", + "10-kaikki-fa-pos-adv-word_title", + readOptions={ + "word_title": True, + }, + ) + + def test_convert_jsonl_txt_3(self): + self.convert_jsonl_txt( + "03-kaikki-fa-selection", + "03-kaikki-fa-selection", + ) + # testing these features + # "antonyms" in sense + # "topics" in sense + # "form_of" in sense + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/glossary_v2_test.py b/tests/glossary_v2_test.py index a7b877f65..d07a72c8b 100644 --- a/tests/glossary_v2_test.py +++ b/tests/glossary_v2_test.py @@ -282,9 +282,31 @@ def convert( # noqa: PLR0913 self.assertEqual(md5sum, actualMd5, msg) return - def convert_sqlite_both(self, *args, **kwargs): + def convert_txt_txt( + self, + fname, # input txt file without extension + fname2, # expected output txt file without extension + fnamePrefix="", + testId="tmp", + config=None, + **convertArgs, + ): + self.convert( + f"{fnamePrefix}{fname}.txt", + f"{fname2}-{testId}.txt", + compareText=f"{fnamePrefix}{fname2}.txt", + testId=testId, + config=config, + **convertArgs, + ) + + def convert_txt_txt_sort(self, *args, **convertArgs): for sqlite in (None, True, False): - self.convert(*args, sqlite=sqlite, **kwargs) + self.convert_txt_txt(*args, sort=True, sqlite=sqlite, **convertArgs) + + os.environ["NO_SQLITE"] = "1" + self.convert_txt_txt(*args, sort=True, sqlite=False, **convertArgs) + del os.environ["NO_SQLITE"] class TestGlossary(TestGlossaryBase): @@ -295,6 +317,7 @@ def __init__(self, *args, **kwargs): { "100-en-fa-sort.txt": "d7a82dc8", "100-en-fa-sort-headword.txt": "4067a29f", + "100-en-fa-sort-headword-fa.txt": "d01fcee1", "100-en-fa-sort-ebook.txt": "aa620d07", "100-en-fa-sort-ebook3.txt": 
"5a20f140", "100-en-fa-lower.txt": "62178940", @@ -544,23 +567,6 @@ def test_lang_detect_5(self): ("English", "German"), ) - def convert_txt_txt( - self, - fname, # input txt file without extension - fname2, # expected output txt file without extension - testId="tmp", - config=None, - **convertArgs, - ): - self.convert( - f"{fname}.txt", - f"{fname2}-{testId}.txt", - compareText=f"{fname2}.txt", - testId=testId, - config=config, - **convertArgs, - ) - def convert_to_txtZip( self, fname, # input file with extension @@ -604,75 +610,67 @@ def test_txt_txtZip_1(self): ) def test_sort_1(self): - self.convert_txt_txt( + self.convert_txt_txt_sort( "100-en-fa", "100-en-fa-sort", testId="sort_1", - sort=True, ) def test_sort_2(self): - self.convert_txt_txt( + self.convert_txt_txt_sort( "100-en-fa", "100-en-fa-sort", testId="sort_2", - sort=True, sortKeyName="headword_lower", ) def test_sort_3(self): - self.convert_txt_txt( + self.convert_txt_txt_sort( "100-en-fa", "100-en-fa-sort-headword", testId="sort_3", - sort=True, sortKeyName="headword", ) def test_sort_4(self): - self.convert_txt_txt( + self.convert_txt_txt_sort( "300-rand-en-fa", "300-rand-en-fa-sort-headword", testId="sort_4", - sort=True, sortKeyName="headword", ) def test_sort_5(self): - self.convert_txt_txt( + self.convert_txt_txt_sort( "300-rand-en-fa", "300-rand-en-fa-sort-headword-w1256", testId="sort_5", - sort=True, sortKeyName="headword", sortEncoding="windows-1256", ) def test_sort_6(self): - self.convert_txt_txt( + self.convert_txt_txt_sort( "300-rand-en-fa", "300-rand-en-fa-sort-w1256", testId="sort_6", - sort=True, sortKeyName="headword_lower", sortEncoding="windows-1256", ) def test_sort_7(self): - self.convert_txt_txt( + self.convert_txt_txt_sort( "100-en-fa", "100-en-fa-sort-ebook", testId="sort_7", - sort=True, sortKeyName="ebook", ) def test_sort_8(self): - self.convert_txt_txt( + self.convert_txt_txt_sort( "100-en-fa", "100-en-fa-sort-ebook3", testId="sort_8", - sort=True, 
sortKeyName="ebook_length3", ) @@ -757,14 +755,11 @@ def test_txt_txt_bar(self): ) def test_txt_txt_bar_sort(self): - for sqlite in (None, False, True): - self.convert_txt_txt( - "004-bar", - "004-bar-sort", - testId="bar_sort", - sort=True, - sqlite=sqlite, - ) + self.convert_txt_txt_sort( + "004-bar", + "004-bar-sort", + testId="bar_sort", + ) def test_txt_txt_empty_filtered(self): for direct in (None, False, True): @@ -969,49 +964,49 @@ def test_wordTitleStr_cjk2(self): ) def test_convert_sortLocale_default_1(self): - name = "092-en-fa-alphabet-sample" - self.convert_sqlite_both( - f"sort-locale/{name}.txt", - f"{name}-sorted-default.txt", - compareText=f"sort-locale/{name}-sorted-default.txt", + self.convert_txt_txt_sort( + "092-en-fa-alphabet-sample", + "092-en-fa-alphabet-sample-sorted-default", + fnamePrefix="sort-locale/", testId="sorted-default", - sort=True, sortKeyName="headword_lower", ) def test_convert_sortLocale_en_1(self): - name = "092-en-fa-alphabet-sample" - self.convert_sqlite_both( - f"sort-locale/{name}.txt", - f"{name}-sorted-en.txt", - compareText=f"sort-locale/{name}-sorted-en.txt", - testId="sorted-en", - sort=True, + self.convert_txt_txt_sort( + "092-en-fa-alphabet-sample", + "092-en-fa-alphabet-sample-sorted-en", + fnamePrefix="sort-locale/", + testId="sorted-en-headword_lower", sortKeyName="headword_lower:en_US.UTF-8", ) def test_convert_sortLocale_fa_1(self): - name = "092-en-fa-alphabet-sample" - self.convert_sqlite_both( - f"sort-locale/{name}.txt", - f"{name}-sorted-fa.txt", - compareText=f"sort-locale/{name}-sorted-fa.txt", - testId="sorted-fa", - sort=True, + self.convert_txt_txt_sort( + "092-en-fa-alphabet-sample", + "092-en-fa-alphabet-sample-sorted-fa", + fnamePrefix="sort-locale/", + testId="sorted-fa-headword_lower", sortKeyName="headword_lower:fa_IR.UTF-8", ) def test_convert_sortLocale_fa_2(self): - name = "092-en-fa-alphabet-sample" - self.convert_sqlite_both( - f"sort-locale/{name}.txt", - f"{name}-sorted-latin-fa.txt", - 
compareText=f"sort-locale/{name}-sorted-latin-fa.txt", + self.convert_txt_txt_sort( + "092-en-fa-alphabet-sample", + "092-en-fa-alphabet-sample-sorted-latin-fa", + fnamePrefix="sort-locale/", testId="sorted-latin-fa", - sort=True, sortKeyName="headword_lower:fa-u-kr-latn-arab", ) + def test_convert_sortLocale_fa_3(self): + self.convert_txt_txt_sort( + "100-en-fa", + "100-en-fa-sort-headword-fa", + testId="sorted-fa-headword", + sortKeyName="headword:fa", + ) + if __name__ == "__main__": unittest.main() diff --git a/tests/stardict_test.py b/tests/stardict_test.py index 680596611..add851522 100644 --- a/tests/stardict_test.py +++ b/tests/stardict_test.py @@ -2,13 +2,14 @@ import random import unittest from functools import cmp_to_key +from typing import Any -def toBytes(s): +def toBytes(s: str | bytes) -> bytes: return bytes(s, "utf-8") if isinstance(s, str) else bytes(s) -def sortKeyBytes(ba: bytes): +def sortKeyBytes(ba: bytes) -> Any: assert isinstance(ba, bytes) # ba.lower() + ba is wrong return ( @@ -17,9 +18,9 @@ def sortKeyBytes(ba: bytes): ) -def stardictStrCmp(s1, s2): +def stardictStrCmp(s1: str, s2: str) -> int: """ - use this function to sort index items in StarDict dictionary + Use this function to sort index items in StarDict dictionary s1 and s2 must be utf-8 encoded strings. """ s1 = toBytes(s1) @@ -34,7 +35,7 @@ def stardictStrCmp(s1, s2): sortKeyOld = cmp_to_key(stardictStrCmp) # TOO SLOW -def asciiStrCaseCmp(ba1, ba2): +def asciiStrCaseCmp(ba1: bytes, ba2: bytes) -> int: """ ba1 and ba2 are instances of bytes imitate g_ascii_strcasecmp function of glib library gstrfuncs.c file. @@ -48,7 +49,7 @@ def asciiStrCaseCmp(ba1, ba2): return len(ba1) - len(ba2) -def strCmp(ba1, ba2): +def strCmp(ba1: bytes, ba2: bytes) -> int: """ ba1 and ba2 are instances of bytes imitate strcmp of standard C library. 
@@ -70,27 +71,22 @@ def strCmp(ba1, ba2): return len(ba1) - len(ba2) -def isAsciiAlpha(c): - """C is int.""" +def isAsciiAlpha(c: int) -> bool: return ord("A") <= c <= ord("Z") or ord("a") <= c <= ord("z") -def isAsciiLower(c): +def isAsciiLower(c: int) -> bool: return ord("a") <= c <= ord("z") -def isAsciiUpper(c): - """ - c is int - imitate ISUPPER macro of glib library gstrfuncs.c file. - """ +def isAsciiUpper(c: int) -> bool: + """Imitate ISUPPER macro of glib library gstrfuncs.c file.""" return ord("A") <= c <= ord("Z") -def asciiLower(c): +def asciiLower(c: int) -> int: """ - c is int - returns int (ascii character code). + Returns int (ascii character code). imitate TOLOWER macro of glib library gstrfuncs.c file @@ -111,7 +107,7 @@ def asciiLower(c): return c - ord("A") + ord("a") if isAsciiUpper(c) else c -def getRandomBytes(avgLen, sigma): +def getRandomBytes(avgLen: float, sigma: float) -> bytes: length = round(random.gauss(avgLen, sigma)) return bytes([random.choice(range(256)) for _ in range(length)]) diff --git a/whitelist.py b/whitelist.py index d9538a688..baa9b42e6 100644 --- a/whitelist.py +++ b/whitelist.py @@ -94,7 +94,6 @@ _.gzipEndOffset # unused attribute (pyglossary/plugins/babylon_bgl/bgl_reader_debug.py:42) _.gzipStartOffset # unused attribute (pyglossary/plugins/babylon_bgl/bgl_reader_debug.py:363) _.gzipStartOffset # unused attribute (pyglossary/plugins/babylon_bgl/bgl_reader_debug.py:41) -_.innerXML # unused method (pyglossary/plugins/iupac_goldbook.py:160) _.isatty # unused method (pyglossary/io_utils.py:26) _.isatty # unused method (pyglossary/io_utils.py:98) _.isatty # unused method (pyglossary/slob.py:294)