Modern Python, self-update, better CLI

simoncozens · Sep 11, 2024 · a20515b · a20515b
1 parent a59788a
commit a20515b
Show file tree

Hide file tree

Showing 6 changed files with 163 additions and 70 deletions.
diff --git a/README.rst → README.md b/README.rst → README.md
@@ -1,8 +1,7 @@
-youseedee: interface to the UCD
--------------------------------
+# youseedee: interface to the UCD
 
-This module allows you to query the Unicode Character Database. The main function
-to be imported is ``ucd_data``::
+This module allows you to query the Unicode Character Database. The main
+function to be imported is `ucd_data`:
 
     >>> ucd_data(0x078A)
     {'Age': '3.0',
@@ -14,10 +13,11 @@ to be imported is ``ucd_data``::
      'Name': 'THAANA LETTER FAAFU',
      'Script': 'Thaana'}
 
-On first run, it will download the database files for you from unicode.org.
-These are stored in ``.youseedee`` in your home directory.
+On first run, it will download the database files for you from
+unicode.org. These are stored in `.youseedee` in your home directory.
+They are also updated automatically when newer data is available from unicode.org.
 
-You may also use it as a command line utility::
+You may also use it as a command line utility:
 
     $ python3 -m youseedee 0x078A
     {'Age': '3.0',
@@ -39,4 +39,4 @@ You may also use it as a command line utility::
      'General_Category': 'Lu',
      'Line_Break': 'AL',
      'Name': 'LATIN CAPITAL LETTER K WITH CARON',
-     'Script': 'Latin'}
+     'Script': 'Latin'}
diff --git a/lib/youseedee/__init__.py b/lib/youseedee/__init__.py
@@ -1,51 +1,89 @@
-import zipfile
-from os.path import expanduser
+"""Python interface to the Unicode Character Database"""
+
+import bisect
+import csv
+import datetime
+import logging
 import os
-import requests
-import sys
 import re
-import csv
-import bisect
+import sys
+import time
+import zipfile
+from os.path import expanduser
 
+import requests
 from filelock import FileLock
 
+log = logging.getLogger(__name__)
+
 try:
     from tqdm import tqdm
 
     wrapattr = tqdm.wrapattr
 except ImportError:
-    wrapattr = lambda x, y, **kwargs: x
+
+    def wrapattr(x, _y, **_kwargs):
+        return x
 
 
 def bisect_key(haystack, needle, key):
     if sys.version_info[0:2] >= (3, 10):
         return bisect.bisect_right(haystack, needle, key=key)
-    else:
-        haystack = [key(h) for h in haystack]
-        return bisect.bisect_right(haystack, needle)
+    haystack = [key(h) for h in haystack]
+    return bisect.bisect_right(haystack, needle)
 
 
 UCD_URL = "https://unicode.org/Public/UCD/latest/ucd/UCD.zip"
 
 
 def ucd_dir():
-    ucddir = os.path.expanduser("~/.youseedee")
+    """Return the directory where Unicode data is stored"""
+    ucddir = expanduser("~/.youseedee")
     try:
         os.mkdir(ucddir)
     except FileExistsError:
         pass
     return ucddir
 
 
+def up_to_date():
+    """Check if the Unicode data is up to date
+
+    Returns True when the local `UnicodeData.txt` is less than three
+    months old, or when the `Last-Modified` date served for `UCD_URL`
+    is older than the local file. Returns False when the server has
+    newer data, or when its modification date is missing or cannot be
+    parsed. If the server cannot be reached at all, the existing local
+    data is kept and True is returned.
+    """
+    data_date = os.path.getmtime(os.path.join(ucd_dir(), "UnicodeData.txt"))
+    # OK if it's less than three months old
+    if time.time() - data_date < 60 * 60 * 24 * 30 * 3:
+        log.debug("Youseedee data is less than three months old")
+        return True
+    # Let's check if Unicode has anything newer. HEAD requests do not follow
+    # redirects unless asked to, and a redirect response carries no useful
+    # Last-Modified header.
+    try:
+        response = requests.head(UCD_URL, timeout=5, allow_redirects=True)
+    except requests.RequestException as e:
+        # Being offline should not make the already-downloaded data unusable
+        log.warning("Could not contact unicode.org (%s), using existing data", e)
+        return True
+    if "Last-Modified" not in response.headers:
+        log.warning("Could not detect when Unicode last updated, updating anyway")
+        return False
+    last_modified = response.headers["Last-Modified"]
+    try:
+        available = datetime.datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
+    except ValueError:
+        log.warning("Could not detect when Unicode last updated, updating anyway")
+        return False
+    # strptime gives a naive datetime, which .timestamp() would interpret as
+    # local time; HTTP dates are always GMT, so mark it as UTC explicitly.
+    available = available.replace(tzinfo=datetime.timezone.utc)
+    return available.timestamp() < data_date
+
+
 def ensure_files():
-    if os.path.isfile(os.path.join(ucd_dir(), "UnicodeData.txt")):
-        return
+    """Ensure the Unicode data files are downloaded and up to date, and download them if not"""
+    if not os.path.isfile(os.path.join(ucd_dir(), "UnicodeData.txt")):
+        download_files()
+    if not up_to_date():
+        # Remove the zip if it exists
+        zip_path = os.path.join(ucd_dir(), "UCD.zip")
+        if os.path.isfile(zip_path):
+            os.unlink(zip_path)
+        download_files()
+    return
+
 
+def download_files():
+    """Download the Unicode Character Database files"""
     zip_path = os.path.join(ucd_dir(), "UCD.zip")
     lock = FileLock(zip_path + ".lock")
     with lock:
         if not os.path.isfile(zip_path):
-            response = requests.get(UCD_URL, stream=True)
+            log.info("Downloading Unicode Character Database")
+            response = requests.get(UCD_URL, stream=True, timeout=1000)
             with wrapattr(
                 open(zip_path, "wb"),
                 "write",
@@ -64,9 +102,10 @@ def ensure_files():
 
 
 def parse_file_ranges(filename):
+    """Parse a Unicode file with ranges, such as `Blocks.txt`"""
     ensure_files()
     ranges = []
-    with open(os.path.join(ucd_dir(), filename), "r") as f:
+    with open(os.path.join(ucd_dir(), filename), "r", encoding="utf-8") as f:
         for line in f:
             if re.match(r"^\s*#", line):
                 continue
@@ -84,9 +123,12 @@ def parse_file_ranges(filename):
 
 
 def parse_file_semicolonsep(filename):
+    """Parse a semi-colon separated Unicode file, such as `UnicodeData.txt`"""
     ensure_files()
     data = {}
-    with open(os.path.join(ucd_dir(), filename), "r", newline="") as f:
+    with open(
+        os.path.join(ucd_dir(), filename), "r", newline="", encoding="utf-8"
+    ) as f:
         reader = csv.reader(f, delimiter=";", skipinitialspace=True)
         for row in reader:
             if len(row) < 2:
@@ -100,11 +142,19 @@ def parse_file_semicolonsep(filename):
 
 
 def parsed_unicode_file(filename):
+    """Return the parsed data for a given Unicode file
+
+    This function will parse the file if it hasn't been parsed yet,
+    and return the parsed data. The filename is the full filename
+    from the zip file. e.g. `ArabicShaping.txt`. The data is stored
+    in a singleton dictionary, so it will only be parsed once.
+    """
     fileentry = database[filename]
     if "data" in fileentry:
         return fileentry["data"]
     data = fileentry["reader"](filename)
     # Things we will bisect need to be sorted
+    # pylint: disable=comparison-with-callable
     if fileentry["datareader"] == rangereader:
         data = sorted(data, key=lambda x: x[0])
     fileentry["data"] = data
@@ -235,6 +285,12 @@ def rangereader(filename, codepoint):
 
 
 def ucd_data(codepoint):
+    """Return a dictionary of Unicode data for a given codepoint
+
+    This is the main function of the module. It will return a dictionary
+    of Unicode data for a given codepoint. The codepoint is expected to
+    be an integer.
+    """
     out = {}
     for file, props in database.items():
         out.update(props["datareader"](file, codepoint))

diff --git a/lib/youseedee/__main__.py b/lib/youseedee/__main__.py
@@ -1,9 +1,50 @@
-from youseedee import ucd_data
-from pprint import pprint
+import argparse
 import sys
 
-char = sys.argv[1]
-if len(char) > 1:
-  pprint(ucd_data(int(char,16)))
-else:
-  pprint(ucd_data(ord(char)))
+from youseedee import ucd_data, download_files
+
+
+def main(args=None):
+    """Command-line entry point: print the Unicode data for one character
+
+    `args` is an optional list of argument strings; when it is None,
+    argparse reads the process command line. The positional argument is
+    either a single literal character or a hexadecimal codepoint,
+    optionally prefixed with `U+`, `u+`, `0x` or `0X`. Exits with status 1
+    if the argument cannot be understood as a codepoint.
+    """
+    parser = argparse.ArgumentParser(description="Get Unicode Character Data")
+    parser.add_argument(
+        "--force-download",
+        action="store_true",
+        help="Force download of latest Unicode data",
+    )
+    parser.add_argument(
+        "char",
+        type=str,
+        help="The character to get data for (either hex codepoint or character)",
+    )
+
+    args = parser.parse_args(args)
+    if args.force_download:
+        download_files()
+    # Use the parsed positional rather than sys.argv[1]: the latter is the
+    # option itself when --force-download comes first, and is unrelated to
+    # an explicit `args` list passed by a caller.
+    char = args.char
+    if len(char) == 1:
+        codepoint = ord(char)
+    else:
+        try:
+            if char.startswith(("U+", "u+", "0x", "0X")):
+                codepoint = int(char[2:], 16)
+            else:
+                codepoint = int(char, 16)
+            # chr() below only accepts valid Unicode codepoints
+            if not 0 <= codepoint <= sys.maxunicode:
+                raise ValueError(char)
+        except ValueError:
+            print("Could not understand codepoint " + char)
+            sys.exit(1)
+    data = ucd_data(codepoint)
+
+    print(f"\nCharacter data for '{chr(codepoint)}' (U+{codepoint:04X}, {codepoint})\n")
+
+    for key, value in data.items():
+        key = key.replace("_", " ")
+        print(f"{key:40} {value}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,35 @@
+[project]
+dynamic = ["version"]
+
+name = "youseedee"
+description = "Interface to the latest version of the Unicode Character Database"
+
+license = { file = "README.md" }
+
+authors = [{ name = "Simon Cozens", email = "simon@simon-cozens.org" }]
+
+readme = { file = "README.md", content-type = "text/markdown" }
+
+keywords = []
+dependencies = ["requests", "filelock"]
+
+[project.urls]
+homepage = "https://pypi.org/project/youseedee"
+repository = "https://github.com/simoncozens/youseedee"
+
+[build-system]
+requires = ["setuptools>=74.1.0", "setuptools_scm[toml]>=8.1.0"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["lib"]
+
+[tool.setuptools_scm]
+git_describe_command = "git describe --match 'v*' --tags"
+
+[project.scripts]
+
+youseedee = "youseedee.__main__:main"
+
+[tool.pylint."messages control"]
+max-line-length = 120
diff --git a/requirements.txt b/requirements.txt
diff --git a/setup.py b/setup.py