Move dataset interactions into submodules
- Visualize the batch in 'visualize.ipynb'
- Adjusting brightness and contrast requires clipping
- Remove obsolete data in 'missing_cards.json'
Acbarakat committed Aug 20, 2023
1 parent 8487b45 commit 3a1338e
Showing 6 changed files with 411 additions and 102 deletions.
3 changes: 1 addition & 2 deletions .gitignore
@@ -129,8 +129,7 @@ dmypy.json
.pyre/

# CrystalVision
-data/
-src/data/
+/data/
img/
thumb/
frozen_models/
15 changes: 15 additions & 0 deletions src/data/__init__.py
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
"""
Methods to work with test/train/validation data.
Todo:
* ???
"""
import os

SRC_DIR = os.path.join(os.path.dirname(__file__), "..")
DATA_DIR = os.path.join(SRC_DIR, "..", "data")

CARD_API_FILEPATH = os.path.join(DATA_DIR, "cards.json")
MISSING_CARDS_FILEPATH = os.path.join(SRC_DIR, "missing_cards.json")
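
For orientation, a minimal sketch (not part of the commit) of where these joins resolve, assuming the package sits at `<repo>/src/data/`; the `<repo>` placeholder is an assumption:

```python
import os

# Mirror the joins in src/data/__init__.py, assuming __file__ is
# <repo>/src/data/__init__.py; "<repo>" stands in for the repo root.
pkg_dir = os.path.join("<repo>", "src", "data")
src_dir = os.path.normpath(os.path.join(pkg_dir, ".."))            # <repo>/src
data_dir = os.path.normpath(os.path.join(src_dir, "..", "data"))   # <repo>/data

print(os.path.join(data_dir, "cards.json"))         # CARD_API_FILEPATH
print(os.path.join(src_dir, "missing_cards.json"))  # MISSING_CARDS_FILEPATH
```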
154 changes: 154 additions & 0 deletions src/data/dataset.py
@@ -0,0 +1,154 @@
# -*- coding: utf-8 -*-
import json
from typing import Tuple

import pandas as pd
import tensorflow as tf
from keras import layers

try:
from __init__ import CARD_API_FILEPATH
except ModuleNotFoundError:
from data import CARD_API_FILEPATH


def make_database() -> pd.DataFrame:
"""
Load card data and clean up any issue found in the API.
Returns:
Card API dataframe
"""
with open(CARD_API_FILEPATH) as fp:
data = json.load(fp)["cards"]

df = pd.DataFrame(data)
df["thumbs"] = df["images"].apply(lambda i: [j.split("/")[-1] for j in i["thumbs"]])
df["images"] = df["images"].apply(lambda i: [j.split("/")[-1] for j in i["full"]])
df["ex_burst"] = df["ex_burst"].apply(lambda i: i == "\u25cb" or i == "1").astype(bool)
df["multicard"] = df["multicard"].apply(lambda i: i == "\u25cb" or i == "1").astype(bool)
df["mono"] = df["element"].apply(lambda i: len(i) == 1 if i else True).astype(str)
df["element"] = df["element"].str.join("_")
df["power"] = df["power"].str.replace(" ", "").replace("\u2015", "").replace("\uff0d", "")

return df


def extendDataset(ds: tf.data.Dataset,
seed: int | None = None,
name: str | None = None,
batch_size: int | None = 32,
shuffle: bool = True,
reshuffle_each_iteration: bool = True,
flip_horizontal: bool = False,
flip_vertical: bool = True,
brightness: float = 0.2,
                  contrast: Tuple[float, float] | None = (0.5, 1.25),
                  saturation: Tuple[float, float] | None = (0.65, 1.75),
hue: float = 0.025) -> tf.data.Dataset:
"""
Preprocess and add any extra augmented entries to the dataset.
Args:
        ds (tf.data.Dataset): A TensorFlow image Dataset
seed (int): An optional integer used to create a random seed
(default is None)
name (str): Optional name for the Dataset
(default is None)
batch_size (int): Size of the batches of data.
If `None`, the data will not be batched
(default is 32)
shuffle (bool): Whether to shuffle the data.
If set to False, sorts the data in alphanumeric order.
(default is True)
        reshuffle_each_iteration (bool): Whether the shuffle order should
be different for each epoch
(default is True)
        flip_horizontal (bool): Add additional horizontally flipped images
(default is False)
        flip_vertical (bool): Add additional vertically flipped images
(default is True)
brightness (float): A delta randomly picked in the interval
[-max_delta, max_delta) applied across dataset
(default is 0.2)
        contrast (tuple[float]): a contrast_factor randomly picked
in the interval [lower, upper) applied across the dataset
(default is (0.5, 1.25))
        saturation (tuple[float]): a saturation_factor randomly picked
in the interval [lower, upper) applied across the dataset
(default is (0.65, 1.75))
hue (float): a delta randomly picked in the interval
[-max_delta, max_delta) applied across the dataset
(default is 0.025)
Returns:
Dataset
"""
assert brightness >= 0.0, "brightness must be >= 0.0"

preprocess_layer = layers.Rescaling(1. / 255)
# preprocess_layer = layers.Rescaling(scale=1./127.5, offset=-1)

if name:
ds.element_spec[0]._name = f"orig_{name}"

ds = ds.map(tf.autograph.experimental.do_not_convert(
lambda x, y: (preprocess_layer(x), y)),
name=name
)
if flip_horizontal:
raise NotImplementedError("flip_horizontal")

if flip_vertical:
        vertical_ds = ds.map(
lambda x, y: (tf.image.flip_up_down(x), y),
name=f"vertical_{name}"
)
        ds = ds.concatenate(vertical_ds)

ds = ds.cache()

if brightness:
ds = ds.map(
lambda x, y: (tf.clip_by_value(tf.image.random_brightness(x, brightness, seed=seed), 0.0, 1.0), y),
name=f"brightness_{name}"
)

if contrast:
ds = ds.map(
lambda x, y: (tf.clip_by_value(tf.image.random_contrast(x, *contrast, seed=seed), 0.0, 1.0), y),
name=f"contrast_{name}"
)

if saturation:
ds = ds.map(
lambda x, y: (tf.clip_by_value(tf.image.random_saturation(x, *saturation, seed=seed), 0.0, 1.0), y),
name=f"saturated_{name}"
)

if hue:
ds = ds.map(
lambda x, y: (tf.clip_by_value(tf.image.random_hue(x, hue, seed=seed), 0.0, 1.0), y),
name=f"hue_{name}"
)

if shuffle:
ds = ds.shuffle(buffer_size=ds.cardinality(),
seed=seed,
reshuffle_each_iteration=reshuffle_each_iteration,
name=f"shuffled_{name}")

if batch_size:
ds = ds.batch(batch_size, name=f"batch_{name}")

ds = ds.prefetch(tf.data.AUTOTUNE)

ds.element_spec[0]._name = name

return ds
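
A minimal usage sketch for `extendDataset` (not part of the commit): random tensors stand in for decoded card images, and the binary labels are invented for illustration. With real data, `make_database()` would supply labels once `data/cards.json` has been fetched.

```python
import tensorflow as tf

from dataset import extendDataset  # assumes src/data/ is on sys.path

# Eight fake 224x224 RGB "card images" in [0, 255] with made-up labels;
# shapes and label values are illustrative only.
images = tf.random.uniform((8, 224, 224, 3), maxval=255.0)
labels = tf.constant([1, 0, 1, 1, 0, 0, 1, 0])

ds = tf.data.Dataset.from_tensor_slices((images, labels))

# Rescales to [0, 1], doubles the data with vertical flips, applies the
# clipped random brightness/contrast/saturation/hue jitter, then shuffles,
# batches, and prefetches.
ds = extendDataset(ds, seed=23, name="train", batch_size=4)

for batch_images, batch_labels in ds.take(1):
    print(batch_images.shape, batch_labels.shape)  # (4, 224, 224, 3) (4,)
```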
44 changes: 24 additions & 20 deletions src/gatherdata.py → src/data/gather.py
@@ -22,9 +22,10 @@
import pandas as pd
from PIL import ImageFile, Image


-DATA_DIR = os.path.join(".", "data")
-CARD_API_FILEPATH = os.path.join(DATA_DIR, "cards.json")
+try:
+    from __init__ import MISSING_CARDS_FILEPATH, CARD_API_FILEPATH, DATA_DIR
+except ModuleNotFoundError:
+    from data import MISSING_CARDS_FILEPATH, CARD_API_FILEPATH, DATA_DIR


def download_and_save() -> dict:
@@ -37,7 +38,7 @@ def download_and_save() -> dict:
with requests.get("https://fftcg.square-enix-games.com/en/get-cards") as url:
data = url.json()

-    with open(os.path.join(".", "src", "missing_cards.json")) as fp:
+    with open(MISSING_CARDS_FILEPATH) as fp:
missing_cards = json.load(fp)
for card in missing_cards:
card = {key.lower(): value for key, value in card.items()}
@@ -58,7 +59,9 @@ def download_and_save() -> dict:
extra.append(v.replace("_eg.jpg", f"{lang}.jpg").replace("_eg_", f"{lang}_"))

c["images"][d] += extra
if "image" in c: del c["image"]

if "image" in c:
del c["image"]

if duplicates:
for d, code in duplicates[::-1]:
@@ -75,10 +78,10 @@


async def download_image(img_url: str,
-                         subfolder: str='img',
-                         fname: typing.Any=None,
-                         crop: typing.Any=None,
-                         resize: typing.Any=None) -> str:
+                         subfolder: str = 'img',
+                         fname: typing.Any = None,
+                         crop: typing.Any = None,
+                         resize: typing.Any = None) -> str:
"""
Download image and return on-disk destination.
@@ -149,18 +152,18 @@ async def main() -> None:

df = pd.read_table("http://www.square-enix-shop.com/jp/ff-tcg/card/data/list_card.txt", header=None)
df.rename({
0: "Code",
1: "Element",
2: "Name",
0: "code",
1: "element",
2: "name_ja",
7: "image"
}, axis=1, inplace=True)

# Special case flip
df.replace({"Code": "PR-051/11-083R"},
{"Code": "11-083R/PR-051"},
df.replace({"code": "PR-051/11-083R"},
{"code": "11-083R/PR-051"},
inplace=True)
df.replace({"Code": "PR-055/11-062R"},
{"Code": "11-062R/PR-055"},
df.replace({"code": "PR-055/11-062R"},
{"code": "11-062R/PR-055"},
inplace=True)

cleared_codes = []
@@ -171,12 +174,12 @@ async def main() -> None:
if d[key].startswith("B-") or d[key].startswith("C-"):
continue

rows = df.query(f"Code == '{d[key]}' or (Code.str.endswith('/{d[key]}') and Code.str.startswith('PR'))")
rows = df.query(f"code == '{d[key]}' or (code.str.endswith('/{d[key]}') and code.str.startswith('PR'))")
if rows.empty and d[key] not in cleared_codes:
raise Exception(f"Can't find '{d[key]}'")
cleared_codes.append(d[key])
df.query(f"Code != '{d[key]}'", inplace=True)
df.query(f"~(Code.str.endswith('/{d[key]}') and Code.str.startswith('PR'))", inplace=True)
df.query(f"code != '{d[key]}'", inplace=True)
df.query(f"~(code.str.endswith('/{d[key]}') and code.str.startswith('PR'))", inplace=True)

for idx, row in rows.iterrows():
img_loc = row['image']
@@ -187,7 +190,8 @@
else:
fname = f"{d[key].split('/')[0]}_jp.jpg"

if "image" in d: del d["image"]
if "image" in d:
del d["image"]

d["images"]["thumbs"].append(f"http://www.square-enix-shop.com/jp/ff-tcg/card/cimg/thumb/{fname}")
images.append(download_image(f"http://www.square-enix-shop.com/jp/ff-tcg/card/cimg/thumb/{img_loc}",
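
The renamed `code` column drives the lookup above: promo reprints in the Square Enix list carry combined codes such as `PR-055/11-062R`, so a match must accept either the exact code or a `PR`-prefixed combination ending in the target. A toy sketch of the same query pattern (data invented for illustration):

```python
import pandas as pd

df = pd.DataFrame({"code": ["1-001H", "11-083R/PR-051", "PR-055/11-062R"]})

target = "11-062R"
# engine="python" because the .str accessor is not supported by numexpr.
rows = df.query(
    f"code == '{target}' or "
    f"(code.str.endswith('/{target}') and code.str.startswith('PR'))",
    engine="python",
)
print(rows)  # -> the "PR-055/11-062R" row
```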
177 changes: 177 additions & 0 deletions src/data/visualize.ipynb

Large diffs are not rendered by default.

