Skip to content

Commit

Permalink
Modern Python, self-update, better CLI
Browse files Browse the repository at this point in the history
  • Loading branch information
simoncozens committed Sep 11, 2024
1 parent a59788a commit a20515b
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 70 deletions.
16 changes: 8 additions & 8 deletions README.rst → README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
youseedee: interface to the UCD
-------------------------------
# youseedee: interface to the UCD

This module allows you to query the Unicode Character Database. The main function
to be imported is ``ucd_data``::
This module allows you to query the Unicode Character Database. The main
function to be imported is `ucd_data`:

>>> ucd_data(0x078A)
{'Age': '3.0',
Expand All @@ -14,10 +13,11 @@ to be imported is ``ucd_data``::
'Name': 'THAANA LETTER FAAFU',
'Script': 'Thaana'}

On first run, it will download the database files for you from unicode.org.
These are stored in ``.youseedee`` in your home directory.
On first run, it will download the database files for you from
unicode.org. These are stored in `.youseedee` in your home directory.
They are also refreshed automatically when newer data is available from unicode.org.

You may also use it as a command line utility::
You may also use it as a command line utility:

$ python3 -m youseedee 0x078A
{'Age': '3.0',
Expand All @@ -39,4 +39,4 @@ You may also use it as a command line utility::
'General_Category': 'Lu',
'Line_Break': 'AL',
'Name': 'LATIN CAPITAL LETTER K WITH CARON',
'Script': 'Latin'}
'Script': 'Latin'}
88 changes: 72 additions & 16 deletions lib/youseedee/__init__.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,89 @@
import zipfile
from os.path import expanduser
"""Python interface to the Unicode Character Database"""

import bisect
import csv
import datetime
import logging
import os
import requests
import sys
import re
import csv
import bisect
import sys
import time
import zipfile
from os.path import expanduser

import requests
from filelock import FileLock

log = logging.getLogger(__name__)

try:
from tqdm import tqdm

wrapattr = tqdm.wrapattr
except ImportError:
wrapattr = lambda x, y, **kwargs: x

def wrapattr(x, _y, **_kwargs):
return x


def bisect_key(haystack, needle, key):
if sys.version_info[0:2] >= (3, 10):
return bisect.bisect_right(haystack, needle, key=key)
else:
haystack = [key(h) for h in haystack]
return bisect.bisect_right(haystack, needle)
haystack = [key(h) for h in haystack]
return bisect.bisect_right(haystack, needle)


UCD_URL = "https://unicode.org/Public/UCD/latest/ucd/UCD.zip"


def ucd_dir():
ucddir = os.path.expanduser("~/.youseedee")
"""Return the directory where Unicode data is stored"""
ucddir = expanduser("~/.youseedee")
try:
os.mkdir(ucddir)
except FileExistsError:
pass
return ucddir


def up_to_date():
    """Check if the Unicode data is up to date

    The local copy counts as current when ``UnicodeData.txt`` was modified
    less than three months ago. Otherwise the ``Last-Modified`` header of
    ``UCD_URL`` is compared with the file's modification time.

    Returns ``True`` when the local data can be used as-is and ``False``
    when a fresh download should be made. If unicode.org cannot be reached,
    the existing data is kept (``True``) instead of raising, so the library
    stays usable offline. If the server's date is missing or unparseable,
    ``False`` is returned so the data is refreshed.
    """
    # Local import: only this function needs the RFC 2822 date parser.
    from email.utils import parsedate_to_datetime

    data_date = os.path.getmtime(os.path.join(ucd_dir(), "UnicodeData.txt"))
    # OK if it's less than three months old
    if time.time() - data_date < 60 * 60 * 24 * 30 * 3:
        log.debug("Youseedee data is less than three months old")
        return True
    # Let's check if Unicode has anything newer:
    try:
        response = requests.head(UCD_URL, timeout=5)
    except requests.RequestException as exc:
        # Stale-but-present data beats crashing every lookup when offline.
        log.warning(
            "Could not contact unicode.org (%s), using existing Unicode data", exc
        )
        return True
    if "Last-Modified" not in response.headers:
        log.warning("Could not detect when Unicode last updated, updating anyway")
        return False
    last_modified = response.headers["Last-Modified"]
    try:
        # Locale-independent, and yields a timezone-aware datetime for "GMT".
        # strptime() would give a naive value that .timestamp() then reads as
        # local time, skewing the comparison by the machine's UTC offset.
        available = parsedate_to_datetime(last_modified)
    except (TypeError, ValueError):
        log.warning("Could not detect when Unicode last updated, updating anyway")
        return False
    if available.tzinfo is None:
        # HTTP dates are always UTC; never let a naive value fall back to local time.
        available = available.replace(tzinfo=datetime.timezone.utc)
    return available.timestamp() < data_date


def ensure_files():
if os.path.isfile(os.path.join(ucd_dir(), "UnicodeData.txt")):
return
"""Ensure the Unicode data files are downloaded and up to date, and download them if not"""
if not os.path.isfile(os.path.join(ucd_dir(), "UnicodeData.txt")):
download_files()
if not up_to_date():
# Remove the zip if it exists
zip_path = os.path.join(ucd_dir(), "UCD.zip")
if os.path.isfile(zip_path):
os.unlink(zip_path)
download_files()
return


def download_files():
"""Download the Unicode Character Database files"""
zip_path = os.path.join(ucd_dir(), "UCD.zip")
lock = FileLock(zip_path + ".lock")
with lock:
if not os.path.isfile(zip_path):
response = requests.get(UCD_URL, stream=True)
log.info("Downloading Unicode Character Database")
response = requests.get(UCD_URL, stream=True, timeout=1000)
with wrapattr(
open(zip_path, "wb"),
"write",
Expand All @@ -64,9 +102,10 @@ def ensure_files():


def parse_file_ranges(filename):
"""Parse a Unicode file with ranges, such as `Blocks.txt`"""
ensure_files()
ranges = []
with open(os.path.join(ucd_dir(), filename), "r") as f:
with open(os.path.join(ucd_dir(), filename), "r", encoding="utf-8") as f:
for line in f:
if re.match(r"^\s*#", line):
continue
Expand All @@ -84,9 +123,12 @@ def parse_file_ranges(filename):


def parse_file_semicolonsep(filename):
"""Parse a semi-colon separated Unicode file, such as `UnicodeData.txt`"""
ensure_files()
data = {}
with open(os.path.join(ucd_dir(), filename), "r", newline="") as f:
with open(
os.path.join(ucd_dir(), filename), "r", newline="", encoding="utf-8"
) as f:
reader = csv.reader(f, delimiter=";", skipinitialspace=True)
for row in reader:
if len(row) < 2:
Expand All @@ -100,11 +142,19 @@ def parse_file_semicolonsep(filename):


def parsed_unicode_file(filename):
"""Return the parsed data for a given Unicode file
This function will parse the file if it hasn't been parsed yet,
and return the parsed data. The filename is the full filename
from the zip file. e.g. `ArabicShaping.txt`. The data is stored
in a singleton dictionary, so it will only be parsed once.
"""
fileentry = database[filename]
if "data" in fileentry:
return fileentry["data"]
data = fileentry["reader"](filename)
# Things we will bisect need to be sorted
# pylint: disable=comparison-with-callable
if fileentry["datareader"] == rangereader:
data = sorted(data, key=lambda x: x[0])
fileentry["data"] = data
Expand Down Expand Up @@ -235,6 +285,12 @@ def rangereader(filename, codepoint):


def ucd_data(codepoint):
"""Return a dictionary of Unicode data for a given codepoint
This is the main function of the module. It will return a dictionary
of Unicode data for a given codepoint. The codepoint is expected to
be an integer.
"""
out = {}
for file, props in database.items():
out.update(props["datareader"](file, codepoint))
Expand Down
55 changes: 48 additions & 7 deletions lib/youseedee/__main__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,50 @@
from youseedee import ucd_data
from pprint import pprint
import argparse
import sys

char = sys.argv[1]
if len(char) > 1:
pprint(ucd_data(int(char,16)))
else:
pprint(ucd_data(ord(char)))
from youseedee import ucd_data, download_files


def main(args=None):
    """Command-line entry point: print Unicode Character Database data for one character.

    ``args`` is the list of command-line arguments to parse; when it is
    ``None``, argparse reads ``sys.argv[1:]``. The positional ``char``
    argument is either a single literal character or a hexadecimal
    codepoint, optionally prefixed with ``U+``, ``u+``, ``0x`` or ``0X``.

    Exits with status 1 (after printing a message) when the argument cannot
    be understood as a valid Unicode codepoint.
    """
    parser = argparse.ArgumentParser(description="Get Unicode Character Data")
    parser.add_argument(
        "--force-download",
        action="store_true",
        help="Force download of latest Unicode data",
    )
    parser.add_argument(
        "char",
        type=str,
        help="The character to get data for (either hex codepoint or character)",
    )

    args = parser.parse_args(args)
    if args.force_download:
        # NOTE(review): download_files() appears to skip the download when
        # UCD.zip already exists, so this may not actually force a refresh
        # -- confirm and remove the cached zip first if needed.
        download_files()

    # Use the parsed positional, not sys.argv[1]: sys.argv ignores the
    # ``args`` parameter and is the option itself when --force-download
    # comes first on the command line.
    char = args.char
    if len(char) == 1:
        codepoint = ord(char)
    else:
        try:
            if char.startswith(("U+", "u+", "0x", "0X")):
                codepoint = int(char[2:], 16)
            else:
                codepoint = int(char, 16)
            # chr() below rejects anything outside the Unicode range.
            if not 0 <= codepoint <= sys.maxunicode:
                raise ValueError(char)
        except ValueError:
            print("Could not understand codepoint " + char)
            sys.exit(1)
    data = ucd_data(codepoint)

    print(f"\nCharacter data for '{chr(codepoint)}' (U+{codepoint:04X}, {codepoint})\n")

    for key, value in data.items():
        label = key.replace("_", " ")
        print(f"{label:40} {value}")


if __name__ == "__main__":
main()
35 changes: 35 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[project]
dynamic = ["version"]

name = "youseedee"
description = "Interface to the latest version of the Unicode Character Database"

license = { file = "README.md" }

authors = [{ name = "Simon Cozens", email = "simon@simon-cozens.org" }]

readme = { file = "README.md", content-type = "text/markdown" }

keywords = []
dependencies = ["requests", "filelock"]

[project.urls]
homepage = "https://pypi.org/project/youseedee"
repository = "https://github.com/simoncozens/youseedee"

[build-system]
requires = ["setuptools>=74.1.0", "setuptools_scm[toml]>=8.1.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
where = ["lib"]

[tool.setuptools_scm]
git_describe_command = "git describe --match 'v*' --tags"

[project.scripts]

youseedee = "youseedee.__main__:main"

[tool.pylint."messages control"]
max-line-length = 120
2 changes: 0 additions & 2 deletions requirements.txt

This file was deleted.

37 changes: 0 additions & 37 deletions setup.py

This file was deleted.

0 comments on commit a20515b

Please sign in to comment.