forked from mindspore-lab/mindocr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
borndigital.py
72 lines (60 loc) · 3.3 KB
/
borndigital.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
BDI - Born-Digital Images
https://rrc.cvc.uab.es/?ch=1
"""
import json
from pathlib import Path
import cv2
from shapely.geometry import Polygon
from tqdm import tqdm
class BORNDIGITAL_Converter:
"""
Format annotation to standard form for Born-Digital dataset.
The ground truth is provided in terms of word bounding boxes. Bounding boxes are specified by the coordinates of
their two opposite corners. For each image a corresponding UTF-8 encoded text file is provided,
following the naming convention:
`gt_[image name].txt`
The text files are comma separated files, where each line corresponds to one text block in the image and gives
its bounding box coordinates (top-right corner and bottom-left corner) and its transcription in the format:
`x1,y1,x2,y2,transcription`
Language is English only.
Note that the dataset also contains .gif files which are also handled by the converter.
"""
def __init__(self, path_mode="relative", **kwargs):
self._relative = path_mode == "relative"
def convert(self, task="det", image_dir=None, label_path=None, output_path=None):
label_path = Path(label_path)
assert label_path.exists(), f"{label_path} does not exist!"
if task == "det":
self._format_det_label(Path(image_dir), label_path, output_path)
else:
raise ValueError("Born-Digital currently supports only detection.")
def _format_det_label(self, image_dir: Path, label_path: Path, output_path: str):
with open(output_path, "w", encoding="utf-8") as out_file:
images = sorted(image_dir.iterdir(), key=lambda path: int(path.stem.split("_")[-1]))
for img_path in tqdm(images, total=len(images)):
label = []
with open(label_path / ("gt_" + img_path.stem + ".txt"), "r", encoding="utf-8") as label_file:
for line in label_file.readlines():
line = line.split(", ", 4) # Split the line into 5 parts (4 points + transcription)
line[:4] = [int(x) for x in line[:4]]
points = [[line[0], line[1]], [line[2], line[1]], [line[2], line[3]], [line[0], line[3]]]
if not Polygon(points).is_valid:
print(f"Warning {img_path.name}: skipping invalid polygon {line[:4]}")
continue
label.append(
{"transcription": line[4].strip()[1:-1], "points": points}
) # strip newline character and the default quotation marks
# Handle gif animation by storing as new image via video capturing
if img_path.suffix.lower() == ".gif":
new_path = image_dir / (img_path.stem + ".png")
if not new_path.exists():
cap = cv2.VideoCapture(str(img_path))
_, image = cap.read()
cap.release()
cv2.imwrite(str(new_path), image)
img_path = new_path
else:
continue
img_path = img_path.name if self._relative else str(img_path)
out_file.write(img_path + "\t" + json.dumps(label, ensure_ascii=False) + "\n")