# setup_openmoji.py

"""This script sets up the OpenMoji data used in the dobble package.

Typical usage example:

  $ python setup_openmoji.py
"""


import json
import shutil
from collections import defaultdict
from pathlib import Path

import requests

from dobble import constants


# Switches selecting which setup steps run when the script is executed
DOWNLOAD_EXTRACT_AND_ORGANIZE = False
FIND_DUPLICATES = False
REMOVE_DUPLICATES = True
RESTRUCTURE_JSON_FILE = True

# Location of the OpenMoji data and of its JSON metadata file
OPENMOJI_DIR = Path("data/openmoji")
OPENMOJI_JSON = OPENMOJI_DIR.joinpath("openmoji.json")

# OpenMoji emojis whose "annotation" is not unique, stored as
# (hexcode, group) pairs; all of them are in the "extras-openmoji" group
DUPLICATES = [
    (hexcode, "extras-openmoji")
    for hexcode in (
        "E319",        # brain
        "E001",        # donkey
        "E011",        # microbe
        "E0C3",        # pretzel
        "E1D1",        # keyboard
        "E104",        # scroll
        "E347",        # axe
        "E269",        # link
        "E204",        # elevator
        "E216",        # moai
        "1F3F3-FE0F",  # white flag
    )
]


def download_and_extract_openmoji_data() -> None:
    """Download and extract OpenMoji data from GitHub.

    Any existing OpenMoji data directory is deleted and re-created.
    The license, the JSON file, and the color and black image archives
    are then downloaded into it.  Each zip archive is unpacked into a
    directory named after the archive (without the ``.zip`` suffix) and
    the archive itself is deleted afterwards.

    Raises:
        requests.HTTPError: If the server answers one of the downloads
          with an error status code.
    """

    # Remove OpenMoji data directory if it exists
    if OPENMOJI_DIR.exists():
        shutil.rmtree(OPENMOJI_DIR)

    # Re-create OpenMoji data directory
    OPENMOJI_DIR.mkdir(parents=True)

    # Target file names and the URLs they are downloaded from
    downloads = {
        "LICENSE.txt": constants.OPENMOJI_LICENSE_URL,
        "openmoji.json": constants.OPENMOJI_JSON_URL,
        "color.zip": constants.OPENMOJI_COLOR_URL,
        "black.zip": constants.OPENMOJI_BLACK_URL,
    }

    # Download files and save to OpenMoji data directory
    for file_name, url in downloads.items():
        path = OPENMOJI_DIR / file_name
        response = requests.get(url, timeout=10)

        # Fail loudly on HTTP errors instead of silently saving an
        # error page as if it were the requested file
        response.raise_for_status()
        path.write_bytes(response.content)

        # Extract zip archives
        if path.suffix == ".zip":
            shutil.unpack_archive(path, extract_dir=path.with_suffix(""))
            path.unlink()


def organize_openmoji_data() -> None:
    """Sort the OpenMoji emoji images into per-group subdirectories.

    For every "group" value found in the JSON file provided by
    OpenMoji, a subdirectory is created below both the "black" and the
    "color" image directory.  Each emoji image that exists directly in
    one of these two directories is then moved into the subdirectory of
    its group.
    """

    modes = ("black", "color")

    # Load the emoji metadata from the JSON file provided by OpenMoji
    emojis = json.loads(OPENMOJI_JSON.read_text(encoding="utf-8"))

    # Make sure a directory exists for every group in both modes
    group_names = {emoji["group"] for emoji in emojis}
    for group_name in group_names:
        for mode in modes:
            group_dir = OPENMOJI_DIR / mode / group_name
            group_dir.mkdir(parents=True, exist_ok=True)

    # Relocate each image into the directory of its group
    for emoji in emojis:
        group_name = emoji["group"]
        image_name = f"{emoji['hexcode']}.png"

        for mode in modes:
            mode_dir = OPENMOJI_DIR / mode
            source = mode_dir / image_name
            if not source.exists():
                continue
            shutil.move(str(source), str(mode_dir / group_name))


def find_duplicates() -> dict[str, list[dict[str, str]]]:
    """Collect the emojis that share their annotation with another one.

    The JSON file provided by OpenMoji is read and every emoji whose
    "annotation" occurs more than once in that file is collected.

    Returns:
        A dictionary keyed by each non-unique "annotation".  Every
        value is a list with one dictionary per emoji carrying that
        annotation, holding the "hexcode" and the "group" of the emoji.
    """

    # Load the emoji metadata from the JSON file provided by OpenMoji
    emojis = json.loads(OPENMOJI_JSON.read_text(encoding="utf-8"))

    # First pass: tally how often each annotation occurs
    occurrences = defaultdict(int)
    for emoji in emojis:
        occurrences[emoji["annotation"]] += 1

    # Second pass: keep only emojis whose annotation occurs repeatedly
    duplicates = defaultdict(list)
    for emoji in emojis:
        annotation = emoji["annotation"]
        if occurrences[annotation] <= 1:
            continue
        duplicates[annotation].append(
            {"hexcode": emoji["hexcode"], "group": emoji["group"]}
        )

    return duplicates


def print_duplicates(duplicates: dict[str, list[dict[str, str]]]) -> None:
    """Write the emojis with duplicate annotations to standard output.

    For every annotation a header line is printed, followed by one
    tab-indented ``("hexcode", "group")`` line per emoji carrying that
    annotation and a blank line.

    Args:
        duplicates: A dictionary mapping each non-unique "annotation"
          to a list of dictionaries holding the "hexcode" and "group"
          of the affected emojis, as returned by the
          ``find_duplicates`` function.
    """

    for annotation, emojis in duplicates.items():
        print(f"Annotation: {annotation}")

        # One line per emoji, formatted like an entry of DUPLICATES
        for emoji in emojis:
            hexcode = emoji["hexcode"]
            group = emoji["group"]
            print(f'\t("{hexcode}", "{group}")')

        # Blank line separating the annotations
        print()


def remove_duplicates() -> None:
    """Delete the OpenMoji emojis listed in ``DUPLICATES``.

    For every (hexcode, group) pair in the ``DUPLICATES`` list, the
    outline-only ("black") and the color image of the emoji are deleted
    if they exist.  Afterwards, the matching entries are dropped from
    the JSON file provided by OpenMoji, which is rewritten in place.
    """

    # Delete the images of every duplicate in both modes; images that
    # are already gone are skipped silently
    for hexcode, group in DUPLICATES:
        for mode in ("black", "color"):
            image_fpath = OPENMOJI_DIR / mode / group / f"{hexcode}.png"
            image_fpath.unlink(missing_ok=True)

    # Load the emoji metadata from the JSON file provided by OpenMoji
    emojis = json.loads(OPENMOJI_JSON.read_text(encoding="utf-8"))

    # Keep only the entries that are not listed as duplicates
    remaining = [
        emoji
        for emoji in emojis
        if (emoji["hexcode"], emoji["group"]) not in DUPLICATES
    ]

    # Overwrite the JSON file with the remaining entries
    with OPENMOJI_JSON.open("w", encoding="utf-8") as json_file:
        json.dump(remaining, json_file, indent=2, ensure_ascii=False)


def restructure_json_file() -> None:
    """Write an annotation-indexed version of the OpenMoji JSON file.

    The JSON file provided by OpenMoji is read and converted into a
    dictionary that maps the "annotation" of each emoji to a dictionary
    with the keys "hexcode", "group", and "subgroup" (taken from the
    "subgroups" parameter of the emoji).  If several emojis share an
    annotation, the last one in the file wins.  The result is written
    to "openmoji_restructured.json" in the OpenMoji data directory; the
    original JSON file is left untouched.
    """

    # Load the emoji metadata from the JSON file provided by OpenMoji
    emojis = json.loads(OPENMOJI_JSON.read_text(encoding="utf-8"))

    # Index the relevant parameters of each emoji by its annotation
    emojis_by_annotation = {
        emoji["annotation"]: {
            "hexcode": emoji["hexcode"],
            "group": emoji["group"],
            "subgroup": emoji["subgroups"],
        }
        for emoji in emojis
    }

    # Store the restructured data next to the original JSON file
    target_fpath = OPENMOJI_DIR / "openmoji_restructured.json"
    with target_fpath.open("w", encoding="utf-8") as json_file:
        json.dump(emojis_by_annotation, json_file, indent=4)


if __name__ == "__main__":
    # Every step is switched on or off by its module-level flag.  The
    # order is fixed: removing duplicates targets the group directories
    # created while organizing, and restructuring reads the JSON file
    # that removing duplicates rewrites.

    # Step 1: fetch the data and sort the images into group directories
    if DOWNLOAD_EXTRACT_AND_ORGANIZE:
        download_and_extract_openmoji_data()
        organize_openmoji_data()

    # Step 2: report emojis that share their annotation
    if FIND_DUPLICATES:
        print_duplicates(find_duplicates())

    # Step 3: delete the emojis listed in DUPLICATES
    if REMOVE_DUPLICATES:
        remove_duplicates()

    # Step 4: write the annotation-indexed JSON file
    if RESTRUCTURE_JSON_FILE:
        restructure_json_file()