From a20515ba8f030f7ae7d5cee62a8db9ec31254427 Mon Sep 17 00:00:00 2001
From: Simon Cozens <simon@simon-cozens.org>
Date: Wed, 11 Sep 2024 20:21:04 +0100
Subject: [PATCH] Modern Python, self-update, better CLI

---
 README.rst => README.md   | 16 +++----
 lib/youseedee/__init__.py | 88 ++++++++++++++++++++++++++++++++-------
 lib/youseedee/__main__.py | 55 ++++++++++++++++++++----
 pyproject.toml            | 35 ++++++++++++++++
 requirements.txt          |  2 -
 setup.py                  | 37 ----------------
 6 files changed, 163 insertions(+), 70 deletions(-)
 rename README.rst => README.md (72%)
 create mode 100644 pyproject.toml
 delete mode 100644 requirements.txt
 delete mode 100644 setup.py

diff --git a/README.rst b/README.md
similarity index 72%
rename from README.rst
rename to README.md
index 6fb07fd..dae04fe 100644
--- a/README.rst
+++ b/README.md
@@ -1,8 +1,7 @@
-youseedee: interface to the UCD
--------------------------------
+# youseedee: interface to the UCD
 
-This module allows you to query the Unicode Character Database. The main function
-to be imported is ``ucd_data``::
+This module allows you to query the Unicode Character Database. The main
+function to be imported is `ucd_data`:
 
     >>> ucd_data(0x078A)
     {'Age': '3.0',
@@ -14,10 +13,11 @@ to be imported is ``ucd_data``::
      'Name': 'THAANA LETTER FAAFU',
      'Script': 'Thaana'}
 
-On first run, it will download the database files for you from unicode.org.
-These are stored in ``.youseedee`` in your home directory.
+On first run, it will download the database files for you from
+unicode.org. These are stored in `.youseedee` in your home directory.
+These are also updated if new data is available from unicode.org.
 
-You may also use it as a command line utility::
+You may also use it as a command line utility:
 
     $ python3 -m youseedee 0x078A
     {'Age': '3.0',
@@ -39,4 +39,4 @@ You may also use it as a command line utility::
      'General_Category': 'Lu',
      'Line_Break': 'AL',
      'Name': 'LATIN CAPITAL LETTER K WITH CARON',
-     'Script': 'Latin'}
\ No newline at end of file
+     'Script': 'Latin'}
diff --git a/lib/youseedee/__init__.py b/lib/youseedee/__init__.py
index 507e4ac..8db9c5f 100644
--- a/lib/youseedee/__init__.py
+++ b/lib/youseedee/__init__.py
@@ -1,35 +1,44 @@
-import zipfile
-from os.path import expanduser
+"""Python interface to the Unicode Character Database"""
+
+import bisect
+import csv
+import datetime
+import logging
 import os
-import requests
-import sys
 import re
-import csv
-import bisect
+import sys
+import time
+import zipfile
+from os.path import expanduser
 
+import requests
 from filelock import FileLock
 
+log = logging.getLogger(__name__)
+
 try:
     from tqdm import tqdm
 
     wrapattr = tqdm.wrapattr
 except ImportError:
-    wrapattr = lambda x, y, **kwargs: x
+
+    def wrapattr(x, _y, **_kwargs):
+        return x
 
 
 def bisect_key(haystack, needle, key):
     if sys.version_info[0:2] >= (3, 10):
         return bisect.bisect_right(haystack, needle, key=key)
-    else:
-        haystack = [key(h) for h in haystack]
-        return bisect.bisect_right(haystack, needle)
+    haystack = [key(h) for h in haystack]
+    return bisect.bisect_right(haystack, needle)
 
 
 UCD_URL = "https://unicode.org/Public/UCD/latest/ucd/UCD.zip"
 
 
 def ucd_dir():
-    ucddir = os.path.expanduser("~/.youseedee")
+    """Return the directory where Unicode data is stored"""
+    ucddir = expanduser("~/.youseedee")
     try:
         os.mkdir(ucddir)
     except FileExistsError:
@@ -37,15 +46,44 @@ def ucd_dir():
     return ucddir
 
 
+def up_to_date():
+    """Return True if local data is fresh, or no older than unicode.org's copy"""
+    data_date = os.path.getmtime(os.path.join(ucd_dir(), "UnicodeData.txt"))
+    # OK if it's less than three months old
+    if time.time() - data_date < 60 * 60 * 24 * 30 * 3:
+        log.debug("Youseedee data is less than three months old")
+        return True
+    # Ask unicode.org; its Last-Modified is GMT, so compare as UTC, not local time
+    response = requests.head(UCD_URL, timeout=5)
+    if "Last-Modified" not in response.headers:
+        log.warning("Could not detect when Unicode last updated, updating anyway")
+        return False
+    last_modified = response.headers["Last-Modified"]
+    available = datetime.datetime.strptime(last_modified, "%a, %d %b %Y %H:%M:%S %Z")
+    return available.replace(tzinfo=datetime.timezone.utc).timestamp() < data_date
+
+
 def ensure_files():
-    if os.path.isfile(os.path.join(ucd_dir(), "UnicodeData.txt")):
-        return
+    """Ensure the Unicode data files are downloaded and up to date, and download them if not"""
+    if not os.path.isfile(os.path.join(ucd_dir(), "UnicodeData.txt")):
+        download_files()
+    if not up_to_date():
+        # Remove the zip if it exists
+        zip_path = os.path.join(ucd_dir(), "UCD.zip")
+        if os.path.isfile(zip_path):
+            os.unlink(zip_path)
+        download_files()
+    return
+
 
+def download_files():
+    """Download the Unicode Character Database files"""
     zip_path = os.path.join(ucd_dir(), "UCD.zip")
     lock = FileLock(zip_path + ".lock")
     with lock:
         if not os.path.isfile(zip_path):
-            response = requests.get(UCD_URL, stream=True)
+            log.info("Downloading Unicode Character Database")
+            response = requests.get(UCD_URL, stream=True, timeout=1000)
             with wrapattr(
                 open(zip_path, "wb"),
                 "write",
@@ -64,9 +102,10 @@ def ensure_files():
 
 
 def parse_file_ranges(filename):
+    """Parse a Unicode file with ranges, such as `Blocks.txt`"""
     ensure_files()
     ranges = []
-    with open(os.path.join(ucd_dir(), filename), "r") as f:
+    with open(os.path.join(ucd_dir(), filename), "r", encoding="utf-8") as f:
         for line in f:
             if re.match(r"^\s*#", line):
                 continue
@@ -84,9 +123,12 @@ def parse_file_ranges(filename):
 
 
 def parse_file_semicolonsep(filename):
+    """Parse a semi-colon separated Unicode file, such as `UnicodeData.txt`"""
     ensure_files()
     data = {}
-    with open(os.path.join(ucd_dir(), filename), "r", newline="") as f:
+    with open(
+        os.path.join(ucd_dir(), filename), "r", newline="", encoding="utf-8"
+    ) as f:
         reader = csv.reader(f, delimiter=";", skipinitialspace=True)
         for row in reader:
             if len(row) < 2:
@@ -100,11 +142,19 @@ def parse_file_semicolonsep(filename):
 
 
 def parsed_unicode_file(filename):
+    """Return the parsed data for a given Unicode file
+
+    This function will parse the file if it hasn't been parsed yet,
+    and return the parsed data. The filename is the full filename
+    from the zip file. e.g. `ArabicShaping.txt`. The data is stored
+    in a singleton dictionary, so it will only be parsed once.
+    """
     fileentry = database[filename]
     if "data" in fileentry:
         return fileentry["data"]
     data = fileentry["reader"](filename)
     # Things we will bisect need to be sorted
+    # pylint: disable=comparison-with-callable
     if fileentry["datareader"] == rangereader:
         data = sorted(data, key=lambda x: x[0])
     fileentry["data"] = data
@@ -235,6 +285,12 @@ def rangereader(filename, codepoint):
 
 
 def ucd_data(codepoint):
+    """Return a dictionary of Unicode data for a given codepoint
+
+    This is the main function of the module. It will return a dictionary
+    of Unicode data for a given codepoint. The codepoint is expected to
+    be an integer.
+    """
     out = {}
     for file, props in database.items():
         out.update(props["datareader"](file, codepoint))
diff --git a/lib/youseedee/__main__.py b/lib/youseedee/__main__.py
index c5d0e33..5b439ba 100644
--- a/lib/youseedee/__main__.py
+++ b/lib/youseedee/__main__.py
@@ -1,9 +1,50 @@
-from youseedee import ucd_data
-from pprint import pprint
+import argparse
 import sys
 
-char = sys.argv[1]
-if len(char) > 1:
-  pprint(ucd_data(int(char,16)))
-else:
-  pprint(ucd_data(ord(char)))
\ No newline at end of file
+from youseedee import ucd_data, download_files
+
+
+def main(args=None):
+    """Print the UCD properties of one character given on the command line
+
+    `args` is the argument list to parse; None means `sys.argv[1:]`.
+    """
+    parser = argparse.ArgumentParser(description="Get Unicode Character Data")
+    parser.add_argument(
+        "--force-download",
+        action="store_true",
+        help="Force download of latest Unicode data",
+    )
+    parser.add_argument(
+        "char",
+        type=str,
+        help="The character to get data for (either hex codepoint or character)",
+    )
+
+    args = parser.parse_args(args)
+    if args.force_download:
+        download_files()
+    # Use the parsed positional: sys.argv[1] is the flag when one is given first
+    char = args.char
+    if len(char) > 1:
+        try:
+            if char.lower().startswith(("u+", "0x")):
+                codepoint = int(char[2:], 16)
+            else:
+                codepoint = int(char, 16)
+        except ValueError:
+            print("Could not understand codepoint " + char)
+            sys.exit(1)
+    else:
+        codepoint = ord(char)
+    data = ucd_data(codepoint)
+
+    print(f"\nCharacter data for '{chr(codepoint)}' (U+{codepoint:04X}, {codepoint})\n")
+
+    for key, value in data.items():
+        key = key.replace("_", " ")
+        print(f"{key:40} {value}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..8bfc980
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,35 @@
+[project]
+dynamic = ["version"]
+
+name = "youseedee"
+description = "Interface to the latest version of the Unicode Character Database"
+
+license = { text = "MIT" }  # the project is MIT-licensed; README.md is not a license file
+
+authors = [{ name = "Simon Cozens", email = "simon@simon-cozens.org" }]
+
+readme = { file = "README.md", content-type = "text/markdown" }
+
+keywords = []
+dependencies = ["requests", "filelock"]
+
+[project.urls]
+homepage = "https://pypi.org/project/youseedee"
+repository = "https://github.com/simoncozens/youseedee"
+
+[build-system]
+requires = ["setuptools>=74.1.0", "setuptools_scm[toml]>=8.1.0"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+where = ["lib"]
+
+[tool.setuptools_scm]
+git_describe_command = "git describe --match 'v*' --tags"
+
+[project.scripts]
+
+youseedee = "youseedee.__main__:main"
+
+[tool.pylint."messages control"]
+max-line-length = 120
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 10e5cc0..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-requests
-filelock
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 757b10e..0000000
--- a/setup.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-from setuptools import setup, find_packages
-import os
-import glob
-
-thelibFolder = os.path.dirname(os.path.realpath(__file__))
-requirementPath = thelibFolder + '/requirements.txt'
-install_requires = []
-if os.path.isfile(requirementPath):
-    with open(requirementPath) as f:
-        install_requires = f.read().splitlines()
-
-config = {
-    'name': 'youseedee',
-    'author': 'Simon Cozens',
-    'author_email': 'simon@simon-cozens.org',
-    'url': 'https://github.com/simoncozens/youseedee',
-    'description': 'Interface to the Unicode Character Database',
-    'long_description': open('README.rst', 'r').read(),
-    'license': 'MIT',
-    'version': '0.5.3',
-    'install_requires': install_requires,
-    'classifiers': [
-        "Programming Language :: Python",
-        "Programming Language :: Python :: 2",
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: MIT License",
-        "Development Status :: 4 - Beta"
-
-    ],
-    'package_dir': {'': 'lib'},
-    'packages': find_packages("lib"),
-}
-
-if __name__ == '__main__':
-    setup(**config)