Move dataset interactions into submodules
- Visualize the batch in 'visualize.ipynb'
- Adjusting brightness and contrast requires clipping
- Remove obsolete data in 'missing_cards.json'
Acbarakat committed Aug 20, 2023
1 parent 8487b45 commit 3a1338e
Showing 6 changed files with 411 additions and 102 deletions.
3 changes: 1 addition & 2 deletions .gitignore
@@ -129,8 +129,7 @@ dmypy.json
.pyre/

# CrystalVision
-data/
-src/data/
+/data/
img/
thumb/
frozen_models/
15 changes: 15 additions & 0 deletions src/data/__init__.py
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-
"""
Methods to work with test/train/validation data.
Todo:
* ???
"""
import os

SRC_DIR = os.path.join(os.path.dirname(__file__), "..")
DATA_DIR = os.path.join(SRC_DIR, "..", "data")

CARD_API_FILEPATH = os.path.join(DATA_DIR, "cards.json")
MISSING_CARDS_FILEPATH = os.path.join(SRC_DIR, "missing_cards.json")
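
For orientation, a minimal sketch (not part of the commit) of where these joins resolve, assuming the package sits at `<repo>/src/data/`; the `<repo>` placeholder is an assumption:

```python
import os

# Mirror the joins in src/data/__init__.py, assuming __file__ is
# <repo>/src/data/__init__.py; "<repo>" stands in for the repo root.
pkg_dir = os.path.join("<repo>", "src", "data")
src_dir = os.path.normpath(os.path.join(pkg_dir, ".."))            # <repo>/src
data_dir = os.path.normpath(os.path.join(src_dir, "..", "data"))   # <repo>/data

print(os.path.join(data_dir, "cards.json"))         # CARD_API_FILEPATH
print(os.path.join(src_dir, "missing_cards.json"))  # MISSING_CARDS_FILEPATH
```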
154 changes: 154 additions & 0 deletions src/data/dataset.py
@@ -0,0 +1,154 @@
# -*- coding: utf-8 -*-
import json
from typing import Tuple

import pandas as pd
import tensorflow as tf
from keras import layers

try:
from __init__ import CARD_API_FILEPATH
except ModuleNotFoundError:
from data import CARD_API_FILEPATH


def make_database() -> pd.DataFrame:
"""
Load card data and clean up any issue found in the API.
Returns:
Card API dataframe
"""
with open(CARD_API_FILEPATH) as fp:
data = json.load(fp)["cards"]

df = pd.DataFrame(data)
df["thumbs"] = df["images"].apply(lambda i: [j.split("/")[-1] for j in i["thumbs"]])
df["images"] = df["images"].apply(lambda i: [j.split("/")[-1] for j in i["full"]])
df["ex_burst"] = df["ex_burst"].apply(lambda i: i == "\u25cb" or i == "1").astype(bool)
df["multicard"] = df["multicard"].apply(lambda i: i == "\u25cb" or i == "1").astype(bool)
df["mono"] = df["element"].apply(lambda i: len(i) == 1 if i else True).astype(str)
df["element"] = df["element"].str.join("_")
df["power"] = df["power"].str.replace(" ", "").replace("\u2015", "").replace("\uff0d", "")

return df


def extendDataset(ds: tf.data.Dataset,
seed: int | None = None,
name: str | None = None,
batch_size: int | None = 32,
shuffle: bool = True,
reshuffle_each_iteration: bool = True,
flip_horizontal: bool = False,
flip_vertical: bool = True,
brightness: float = 0.2,
                  contrast: Tuple[float, float] | None = (0.5, 1.25),
                  saturation: Tuple[float, float] | None = (0.65, 1.75),
hue: float = 0.025) -> tf.data.Dataset:
"""
Preprocess and add any extra augmented entries to the dataset.
Args:
        ds (tf.data.Dataset): A TensorFlow image Dataset
seed (int): An optional integer used to create a random seed
(default is None)
name (str): Optional name for the Dataset
(default is None)
batch_size (int): Size of the batches of data.
If `None`, the data will not be batched
(default is 32)
shuffle (bool): Whether to shuffle the data.
If set to False, sorts the data in alphanumeric order.
(default is True)
        reshuffle_each_iteration (bool): Whether the shuffle order should
be different for each epoch
(default is True)
        flip_horizontal (bool): Add additional horizontally flipped images
(default is False)
        flip_vertical (bool): Add additional vertically flipped images
(default is True)
brightness (float): A delta randomly picked in the interval
[-max_delta, max_delta) applied across dataset
(default is 0.2)
        contrast (tuple[float]): a contrast_factor randomly picked
in the interval [lower, upper) applied across the dataset
(default is (0.5, 1.25))
        saturation (tuple[float]): a saturation_factor randomly picked
in the interval [lower, upper) applied across the dataset
(default is (0.65, 1.75))
hue (float): a delta randomly picked in the interval
[-max_delta, max_delta) applied across the dataset
(default is 0.025)
Returns:
Dataset
"""
assert brightness >= 0.0, "brightness must be >= 0.0"

preprocess_layer = layers.Rescaling(1. / 255)
# preprocess_layer = layers.Rescaling(scale=1./127.5, offset=-1)

if name:
ds.element_spec[0]._name = f"orig_{name}"

ds = ds.map(tf.autograph.experimental.do_not_convert(
lambda x, y: (preprocess_layer(x), y)),
name=name
)
if flip_horizontal:
raise NotImplementedError("flip_horizontal")

if flip_vertical:
        vertical_ds = ds.map(
lambda x, y: (tf.image.flip_up_down(x), y),
name=f"vertical_{name}"
)
        ds = ds.concatenate(vertical_ds)

ds = ds.cache()

if brightness:
ds = ds.map(
lambda x, y: (tf.clip_by_value(tf.image.random_brightness(x, brightness, seed=seed), 0.0, 1.0), y),
name=f"brightness_{name}"
)

if contrast:
ds = ds.map(
lambda x, y: (tf.clip_by_value(tf.image.random_contrast(x, *contrast, seed=seed), 0.0, 1.0), y),
name=f"contrast_{name}"
)

if saturation:
ds = ds.map(
lambda x, y: (tf.clip_by_value(tf.image.random_saturation(x, *saturation, seed=seed), 0.0, 1.0), y),
name=f"saturated_{name}"
)

if hue:
ds = ds.map(
lambda x, y: (tf.clip_by_value(tf.image.random_hue(x, hue, seed=seed), 0.0, 1.0), y),
name=f"hue_{name}"
)

if shuffle:
ds = ds.shuffle(buffer_size=ds.cardinality(),
seed=seed,
reshuffle_each_iteration=reshuffle_each_iteration,
name=f"shuffled_{name}")

if batch_size:
ds = ds.batch(batch_size, name=f"batch_{name}")

ds = ds.prefetch(tf.data.AUTOTUNE)

ds.element_spec[0]._name = name

return ds
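
A minimal usage sketch for `extendDataset` (not part of the commit): random tensors stand in for decoded card images, and the binary labels are invented for illustration. With real data, `make_database()` would supply labels once `data/cards.json` has been fetched.

```python
import tensorflow as tf

from dataset import extendDataset  # assumes src/data/ is on sys.path

# Eight fake 224x224 RGB "card images" in [0, 255] with made-up labels;
# shapes and label values are illustrative only.
images = tf.random.uniform((8, 224, 224, 3), maxval=255.0)
labels = tf.constant([1, 0, 1, 1, 0, 0, 1, 0])

ds = tf.data.Dataset.from_tensor_slices((images, labels))

# Rescales to [0, 1], doubles the data with vertical flips, applies the
# clipped random brightness/contrast/saturation/hue jitter, then shuffles,
# batches, and prefetches.
ds = extendDataset(ds, seed=23, name="train", batch_size=4)

for batch_images, batch_labels in ds.take(1):
    print(batch_images.shape, batch_labels.shape)  # (4, 224, 224, 3) (4,)
```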
44 changes: 24 additions & 20 deletions src/gatherdata.py → src/data/gather.py
@@ -22,9 +22,10 @@
import pandas as pd
from PIL import ImageFile, Image


-DATA_DIR = os.path.join(".", "data")
-CARD_API_FILEPATH = os.path.join(DATA_DIR, "cards.json")
+try:
+    from __init__ import MISSING_CARDS_FILEPATH, CARD_API_FILEPATH, DATA_DIR
+except ModuleNotFoundError:
+    from data import MISSING_CARDS_FILEPATH, CARD_API_FILEPATH, DATA_DIR


def download_and_save() -> dict:
@@ -37,7 +38,7 @@ def download_and_save() -> dict:
with requests.get("https://fftcg.square-enix-games.com/en/get-cards") as url:
data = url.json()

-    with open(os.path.join(".", "src", "missing_cards.json")) as fp:
+    with open(MISSING_CARDS_FILEPATH) as fp:
missing_cards = json.load(fp)
for card in missing_cards:
card = {key.lower(): value for key, value in card.items()}
@@ -58,7 +59,9 @@ def download_and_save() -> dict:
extra.append(v.replace("_eg.jpg", f"{lang}.jpg").replace("_eg_", f"{lang}_"))

c["images"][d] += extra
if "image" in c: del c["image"]

if "image" in c:
del c["image"]

if duplicates:
for d, code in duplicates[::-1]:
@@ -75,10 +78,10 @@


async def download_image(img_url: str,
-                         subfolder: str='img',
-                         fname: typing.Any=None,
-                         crop: typing.Any=None,
-                         resize: typing.Any=None) -> str:
+                         subfolder: str = 'img',
+                         fname: typing.Any = None,
+                         crop: typing.Any = None,
+                         resize: typing.Any = None) -> str:
"""
Download image and return on-disk destination.
@@ -149,18 +152,18 @@ async def main() -> None:

df = pd.read_table("http://www.square-enix-shop.com/jp/ff-tcg/card/data/list_card.txt", header=None)
df.rename({
0: "Code",
1: "Element",
2: "Name",
0: "code",
1: "element",
2: "name_ja",
7: "image"
}, axis=1, inplace=True)

# Special case flip
df.replace({"Code": "PR-051/11-083R"},
{"Code": "11-083R/PR-051"},
df.replace({"code": "PR-051/11-083R"},
{"code": "11-083R/PR-051"},
inplace=True)
df.replace({"Code": "PR-055/11-062R"},
{"Code": "11-062R/PR-055"},
df.replace({"code": "PR-055/11-062R"},
{"code": "11-062R/PR-055"},
inplace=True)

cleared_codes = []
@@ -171,12 +174,12 @@ async def main() -> None:
if d[key].startswith("B-") or d[key].startswith("C-"):
continue

rows = df.query(f"Code == '{d[key]}' or (Code.str.endswith('/{d[key]}') and Code.str.startswith('PR'))")
rows = df.query(f"code == '{d[key]}' or (code.str.endswith('/{d[key]}') and code.str.startswith('PR'))")
if rows.empty and d[key] not in cleared_codes:
raise Exception(f"Can't find '{d[key]}'")
cleared_codes.append(d[key])
df.query(f"Code != '{d[key]}'", inplace=True)
df.query(f"~(Code.str.endswith('/{d[key]}') and Code.str.startswith('PR'))", inplace=True)
df.query(f"code != '{d[key]}'", inplace=True)
df.query(f"~(code.str.endswith('/{d[key]}') and code.str.startswith('PR'))", inplace=True)

for idx, row in rows.iterrows():
img_loc = row['image']
@@ -187,7 +190,8 @@
else:
fname = f"{d[key].split('/')[0]}_jp.jpg"

if "image" in d: del d["image"]
if "image" in d:
del d["image"]

d["images"]["thumbs"].append(f"http://www.square-enix-shop.com/jp/ff-tcg/card/cimg/thumb/{fname}")
images.append(download_image(f"http://www.square-enix-shop.com/jp/ff-tcg/card/cimg/thumb/{img_loc}",
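
The renamed `code` column drives the lookup above: promo reprints in the Square Enix list carry combined codes such as `PR-055/11-062R`, so a match must accept either the exact code or a `PR`-prefixed combination ending in the target. A toy sketch of the same query pattern (data invented for illustration):

```python
import pandas as pd

df = pd.DataFrame({"code": ["1-001H", "11-083R/PR-051", "PR-055/11-062R"]})

target = "11-062R"
# engine="python" because the .str accessor is not supported by numexpr.
rows = df.query(
    f"code == '{target}' or "
    f"(code.str.endswith('/{target}') and code.str.startswith('PR'))",
    engine="python",
)
print(rows)  # -> the "PR-055/11-062R" row
```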
177 changes: 177 additions & 0 deletions src/data/visualize.ipynb

Large diffs are not rendered by default.

