-
Notifications
You must be signed in to change notification settings - Fork 5
/
ocrd_tool.schema.yml
244 lines (244 loc) · 9.68 KB
/
ocrd_tool.schema.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
# Schema (JSON Schema vocabulary, YAML syntax) for an OCR-D tool description.
# A valid document is an object with `version`, `git_url` and `tools`;
# no other top-level keys besides the ones listed under `properties` are allowed.
type: object
description: Schema for tools by OCR-D MP
required:
  - version
  - git_url
  - tools
additionalProperties: false
properties:
  version:
    description: "Version of the tool, expressed as MAJOR.MINOR.PATCH."
    type: string
    pattern: '^[0-9]+\.[0-9]+\.[0-9]+$'
  git_url:
    description: GitHub/GitLab URL
    type: string
    # NOTE(review): `url` is not among the formats JSON Schema defines
    # (`uri` is); confirm whether the consuming validator recognizes it.
    format: url
  dockerhub:
    description: DockerHub image
    type: string
  # Mapping of executable-style keys to per-tool descriptions.
  tools:
    type: object
    additionalProperties: false
    patternProperties:
      # NOTE(review): this key pattern is unanchored, so under JSON Schema
      # regex semantics it matches any key that merely contains `ocrd-`;
      # confirm whether `^ocrd-` was intended.
      'ocrd-.*':
        type: object
        additionalProperties: false
        required:
          - description
          - steps
          - executable
          - categories
          - input_file_grp_cardinality
          - output_file_grp_cardinality
        properties:
          executable:
            description: The name of the CLI executable in $PATH
            type: string
          input_file_grp:
            deprecated: true
            description: (DEPRECATED) Input fileGrp@USE this tool expects by default
            type: array
            items:
              type: string
              # pattern: '^OCR-D-[A-Z0-9-]+$'
          output_file_grp:
            deprecated: true
            description: (DEPRECATED) Output fileGrp@USE this tool produces by default
            type: array
            items:
              type: string
              # pattern: '^OCR-D-[A-Z0-9-]+$'
          # Either a single whole number, or a two-element
          # [minimum, maximum] list of whole numbers.
          input_file_grp_cardinality:
            description: Number of (comma-separated) input fileGrp@USE this tool expects (either an exact value or a minimum,maximum list with -1 for unlimited)
            oneOf:
              - type: number
                multipleOf: 1
              - type: array
                items:
                  type: number
                  multipleOf: 1
                minItems: 2
                maxItems: 2
            default: 1
          # Same shape as `input_file_grp_cardinality`, for the output side.
          output_file_grp_cardinality:
            description: Number of (comma-separated) output fileGrp@USE this tool produces (either an exact value or a minimum,maximum list with -1 for unlimited)
            oneOf:
              - type: number
                multipleOf: 1
              - type: array
                items:
                  type: number
                  multipleOf: 1
                minItems: 2
                maxItems: 2
            default: 1
          parameters:
            description: Object describing the parameters of a tool. Keys are parameter names, values sub-schemas.
            type: object
            default: {}
            patternProperties:
              # Any parameter name is accepted; the value must be one of
              # the restricted sub-schemas described below.
              ".*":
                type: object
                additionalProperties: false
                required:
                  - description
                  - type
                  # also either 'default' or 'required'
                properties:
                  type:
                    type: string
                    description: Data type of this parameter
                    enum:
                      - string
                      - number
                      - boolean
                      - object
                      - array
                  format:
                    description: Subtype, such as `float` for type `number` or `uri` for type `string`.
                  description:
                    description: Concise description of syntax and semantics of this parameter
                  items:
                    type: object
                    description: describe the items of an array further
                  minimum:
                    type: number
                    description: Minimum value for number parameters, including the minimum
                  maximum:
                    type: number
                    description: Maximum value for number parameters, including the maximum
                  minProperties:
                    type: number
                    description: Minimum number of properties of an object
                  maxProperties:
                    type: number
                    description: Maximum number of properties of an object
                  exclusiveMinimum:
                    type: number
                    description: Minimum value for number parameters, excluding the minimum
                  exclusiveMaximum:
                    type: number
                    description: Maximum value for number parameters, excluding the maximum
                  multipleOf:
                    type: number
                    description: For number values, those values must be multiple of this number
                  properties:
                    type: object
                    description: Describe the properties of an object value
                  additionalProperties:
                    oneOf:
                      - type: boolean
                        description: Whether an object value may contain properties not explicitly defined
                      - type: object
                        description: Schema any additional properties need to adhere to
                  required:
                    type: boolean
                    description: Whether this parameter is required
                  default:
                    description: Default value when not provided by the user
                  enum:
                    type: array
                    description: List the allowed values if a fixed list.
                  content-type:
                    type: string
                    default: 'application/octet-stream'
                    description: >
                      The media type of resources this processor expects for
                      this parameter. Most processors use files for resources
                      (e.g. `*.traineddata` for `ocrd-tesserocr-recognize`)
                      while others use directories of files (e.g. `default` for
                      `ocrd-eynollah-segment`). If a parameter requires
                      directories, it must set `content-type` to
                      `text/directory`.
                  cacheable:
                    type: boolean
                    description: "If parameter is reference to file: Whether the file should be cached, e.g. because it is large and won't change."
                    default: false
          description:
            description: Concise description of what the tool does
          categories:
            description: Tools belong to these categories, representing modules within the OCR-D project structure
            type: array
            items:
              type: string
              enum:
                - Image preprocessing
                - Layout analysis
                - Text recognition and optimization
                - Model training
                - Long-term preservation
                - Quality assurance
          steps:
            description: This tool can be used at these steps in the OCR-D functional model
            type: array
            items:
              type: string
              enum:
                - preprocessing/characterization
                - preprocessing/optimization
                - preprocessing/optimization/cropping
                - preprocessing/optimization/deskewing
                - preprocessing/optimization/despeckling
                - preprocessing/optimization/dewarping
                - preprocessing/optimization/binarization
                - preprocessing/optimization/grayscale_normalization
                - recognition/text-recognition
                - recognition/font-identification
                - recognition/post-correction
                - layout/segmentation
                - layout/segmentation/text-nontext
                - layout/segmentation/region
                - layout/segmentation/line
                - layout/segmentation/word
                - layout/segmentation/classification
                - layout/analysis
          resource_locations:
            type: array
            description: The locations in the filesystem this processor supports for resource lookup
            default: ['data', 'cwd', 'system', 'module']
            items:
              type: string
              enum: ['data', 'cwd', 'system', 'module']
          resources:
            type: array
            description: Resources for this processor
            items:
              type: object
              additionalProperties: false
              required:
                - url
                - description
                - name
                - size
              properties:
                url:
                  type: string
                  description: URLs of all components of this resource
                description:
                  type: string
                  description: A description of the resource
                name:
                  type: string
                  description: Name to store the resource as
                type:
                  type: string
                  enum: ['file', 'directory', 'archive']
                  default: file
                  description: Type of the URL
                parameter_usage:
                  type: string
                  description: Defines how the parameter is to be used
                  enum: ['as-is', 'without-extension']
                  default: 'as-is'
                path_in_archive:
                  type: string
                  description: If type is archive, the resource is at this location in the archive
                  default: '.'
                version_range:
                  type: string
                  description: Range of supported versions, syntax like in PEP 440
                  default: '>= 0.0.1'
                size:
                  type: number
                  description: "Size of the resource in bytes to be retrieved (for archives: size of the archive)"