From a20515ba8f030f7ae7d5cee62a8db9ec31254427 Mon Sep 17 00:00:00 2001 From: Simon Cozens Date: Wed, 11 Sep 2024 20:21:04 +0100 Subject: [PATCH] Modern Python, self-update, better CLI --- README.rst => README.md | 16 +++---- lib/youseedee/__init__.py | 88 ++++++++++++++++++++++++++++++++------- lib/youseedee/__main__.py | 55 ++++++++++++++++++++---- pyproject.toml | 35 ++++++++++++++++ requirements.txt | 2 - setup.py | 37 ---------------- 6 files changed, 163 insertions(+), 70 deletions(-) rename README.rst => README.md (72%) create mode 100644 pyproject.toml delete mode 100644 requirements.txt delete mode 100644 setup.py diff --git a/README.rst b/README.md similarity index 72% rename from README.rst rename to README.md index 6fb07fd..dae04fe 100644 --- a/README.rst +++ b/README.md @@ -1,8 +1,7 @@ -youseedee: interface to the UCD -------------------------------- +# youseedee: interface to the UCD -This module allows you to query the Unicode Character Database. The main function -to be imported is ``ucd_data``:: +This module allows you to query the Unicode Character Database. The main +function to be imported is `ucd_data`: >>> ucd_data(0x078A) {'Age': '3.0', @@ -14,10 +13,11 @@ to be imported is ``ucd_data``:: 'Name': 'THAANA LETTER FAAFU', 'Script': 'Thaana'} -On first run, it will download the database files for you from unicode.org. -These are stored in ``.youseedee`` in your home directory. +On first run, it will download the database files for you from +unicode.org. These are stored in `.youseedee` in your home directory. 
+These are also updated if new data is available from unicode.org -You may also use it as a command line utility:: +You may also use it as a command line utility: $ python3 -m youseedee 0x078A {'Age': '3.0', @@ -39,4 +39,4 @@ You may also use it as a command line utility:: 'General_Category': 'Lu', 'Line_Break': 'AL', 'Name': 'LATIN CAPITAL LETTER K WITH CARON', - 'Script': 'Latin'} \ No newline at end of file + 'Script': 'Latin'} diff --git a/lib/youseedee/__init__.py b/lib/youseedee/__init__.py index 507e4ac..8db9c5f 100644 --- a/lib/youseedee/__init__.py +++ b/lib/youseedee/__init__.py @@ -1,35 +1,44 @@ -import zipfile -from os.path import expanduser +"""Python interface to the Unicode Character Database""" + +import bisect +import csv +import datetime +import logging import os -import requests -import sys import re -import csv -import bisect +import sys +import time +import zipfile +from os.path import expanduser +import requests from filelock import FileLock +log = logging.getLogger(__name__) + try: from tqdm import tqdm wrapattr = tqdm.wrapattr except ImportError: - wrapattr = lambda x, y, **kwargs: x + + def wrapattr(x, _y, **_kwargs): + return x def bisect_key(haystack, needle, key): if sys.version_info[0:2] >= (3, 10): return bisect.bisect_right(haystack, needle, key=key) - else: - haystack = [key(h) for h in haystack] - return bisect.bisect_right(haystack, needle) + haystack = [key(h) for h in haystack] + return bisect.bisect_right(haystack, needle) UCD_URL = "https://unicode.org/Public/UCD/latest/ucd/UCD.zip" def ucd_dir(): - ucddir = os.path.expanduser("~/.youseedee") + """Return the directory where Unicode data is stored""" + ucddir = expanduser("~/.youseedee") try: os.mkdir(ucddir) except FileExistsError: @@ -37,15 +46,44 @@ def ucd_dir(): return ucddir +def up_to_date(): + """Check if the Unicode data is up to date""" + data_date = os.path.getmtime(os.path.join(ucd_dir(), "UnicodeData.txt")) + # OK if it's less than three months old + if time.time() 
- data_date < 60 * 60 * 24 * 30 * 3: + log.debug("Youseedee data is less than three months old") + return True + # Let's check if Unicode has anything newer: + response = requests.head(UCD_URL, timeout=5) + if "Last-Modified" not in response.headers: + log.warning("Could not detect when Unicode last updated, updating anyway") + return False + last_modified = response.headers["Last-Modified"] + available = datetime.datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z") + return available.timestamp() < data_date + + def ensure_files(): - if os.path.isfile(os.path.join(ucd_dir(), "UnicodeData.txt")): - return + """Ensure the Unicode data files are downloaded and up to date, and download them if not""" + if not os.path.isfile(os.path.join(ucd_dir(), "UnicodeData.txt")): + download_files() + if not up_to_date(): + # Remove the zip if it exists + zip_path = os.path.join(ucd_dir(), "UCD.zip") + if os.path.isfile(zip_path): + os.unlink(zip_path) + download_files() + return + +def download_files(): + """Download the Unicode Character Database files""" zip_path = os.path.join(ucd_dir(), "UCD.zip") lock = FileLock(zip_path + ".lock") with lock: if not os.path.isfile(zip_path): - response = requests.get(UCD_URL, stream=True) + log.info("Downloading Unicode Character Database") + response = requests.get(UCD_URL, stream=True, timeout=1000) with wrapattr( open(zip_path, "wb"), "write", @@ -64,9 +102,10 @@ def ensure_files(): def parse_file_ranges(filename): + """Parse a Unicode file with ranges, such as `Blocks.txt`""" ensure_files() ranges = [] - with open(os.path.join(ucd_dir(), filename), "r") as f: + with open(os.path.join(ucd_dir(), filename), "r", encoding="utf-8") as f: for line in f: if re.match(r"^\s*#", line): continue @@ -84,9 +123,12 @@ def parse_file_ranges(filename): def parse_file_semicolonsep(filename): + """Parse a semi-colon separated Unicode file, such as `UnicodeData.txt`""" ensure_files() data = {} - with open(os.path.join(ucd_dir(), filename), "r", 
newline="") as f: + with open( + os.path.join(ucd_dir(), filename), "r", newline="", encoding="utf-8" + ) as f: reader = csv.reader(f, delimiter=";", skipinitialspace=True) for row in reader: if len(row) < 2: @@ -100,11 +142,19 @@ def parse_file_semicolonsep(filename): def parsed_unicode_file(filename): + """Return the parsed data for a given Unicode file + + This function will parse the file if it hasn't been parsed yet, + and return the parsed data. The filename is the full filename + from the zip file. e.g. `ArabicShaping.txt`. The data is stored + in a singleton dictionary, so it will only be parsed once. + """ fileentry = database[filename] if "data" in fileentry: return fileentry["data"] data = fileentry["reader"](filename) # Things we will bisect need to be sorted + # pylint: disable=comparison-with-callable if fileentry["datareader"] == rangereader: data = sorted(data, key=lambda x: x[0]) fileentry["data"] = data @@ -235,6 +285,12 @@ def rangereader(filename, codepoint): def ucd_data(codepoint): + """Return a dictionary of Unicode data for a given codepoint + + This is the main function of the module. It will return a dictionary + of Unicode data for a given codepoint. The codepoint is expected to + be an integer. 
+ """ out = {} for file, props in database.items(): out.update(props["datareader"](file, codepoint)) diff --git a/lib/youseedee/__main__.py b/lib/youseedee/__main__.py index c5d0e33..5b439ba 100644 --- a/lib/youseedee/__main__.py +++ b/lib/youseedee/__main__.py @@ -1,9 +1,50 @@ -from youseedee import ucd_data -from pprint import pprint +import argparse import sys -char = sys.argv[1] -if len(char) > 1: - pprint(ucd_data(int(char,16))) -else: - pprint(ucd_data(ord(char))) \ No newline at end of file +from youseedee import ucd_data, download_files + + +def main(args=None): + parser = argparse.ArgumentParser(description="Get Unicode Character Data") + parser.add_argument( + "--force-download", + action="store_true", + help="Force download of latest Unicode data", + ) + parser.add_argument( + "char", + type=str, + help="The character to get data for (either hex codepoint or character)", + ) + + args = parser.parse_args(args) + if args.force_download: + download_files() + char = args.char + if len(char) > 1: + try: + if ( + char.startswith("U+") + or char.startswith("u+") + or char.startswith("0x") + or char.startswith("0X") + ): + codepoint = int(char[2:], 16) + else: + codepoint = int(char, 16) + except ValueError: + print("Could not understand codepoint " + char) + sys.exit(1) + else: + codepoint = ord(char) + data = ucd_data(codepoint) + + print(f"\nCharacter data for '{chr(codepoint)}' (U+{codepoint:04X}, {codepoint})\n") + + for key, value in data.items(): + key = key.replace("_", " ") + print(f"{key:40} {value}") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..8bfc980 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,35 @@ +[project] +dynamic = ["version"] + +name = "youseedee" +description = "Interface to the latest version of the Unicode Character Database" + +license = { file = "README.md" } + +authors = [{ name = "Simon Cozens", email = "simon@simon-cozens.org" }] + +readme = { file = 
"README.md", content-type = "text/markdown" } + +keywords = [] +dependencies = ["requests", "filelock"] + +[project.urls] +homepage = "https://pypi.org/project/youseedee" +repository = "https://github.com/simoncozens/youseedee" + +[build-system] +requires = ["setuptools>=74.1.0", "setuptools_scm[toml]>=8.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +where = ["lib"] + +[tool.setuptools_scm] +git_describe_command = "git describe --match 'v*' --tags" + +[project.scripts] + +youseedee = "youseedee.__main__:main" + +[tool.pylint."messages control"] +max-line-length = 120 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 10e5cc0..0000000 --- a/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -requests -filelock diff --git a/setup.py b/setup.py deleted file mode 100644 index 757b10e..0000000 --- a/setup.py +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -from setuptools import setup, find_packages -import os -import glob - -thelibFolder = os.path.dirname(os.path.realpath(__file__)) -requirementPath = thelibFolder + '/requirements.txt' -install_requires = [] -if os.path.isfile(requirementPath): - with open(requirementPath) as f: - install_requires = f.read().splitlines() - -config = { - 'name': 'youseedee', - 'author': 'Simon Cozens', - 'author_email': 'simon@simon-cozens.org', - 'url': 'https://github.com/simoncozens/youseedee', - 'description': 'Interface to the Unicode Character Database', - 'long_description': open('README.rst', 'r').read(), - 'license': 'MIT', - 'version': '0.5.3', - 'install_requires': install_requires, - 'classifiers': [ - "Programming Language :: Python", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Development Status :: 4 - Beta" - - ], - 'package_dir': {'': 'lib'}, - 'packages': find_packages("lib"), -} - -if __name__ == '__main__': - setup(**config)