This repository has been archived by the owner on Feb 14, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 4
/
OCRmyFiles.sh
executable file
·198 lines (171 loc) · 5.64 KB
/
OCRmyFiles.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#!/bin/bash
# Bash script for adding a text layer to PDF files and converting images in PDFs (with OCR).
# Requirements:
# - OCRmyPDF: https://github.com/jbarlow83/OCRmyPDF
# Install instructions: https://ocrmypdf.readthedocs.io/en/latest/installation.html
# - Tesseract: https://github.com/tesseract-ocr/
# This is installed with OCRmyPDF automatically
# - Tesseract language files
# e.g. "apt-get install tesseract-ocr-deu" for German language
# Usage:
# - OCRmyFiles.sh (no parameter): using default directories for input/output
# - OCRmyFiles.sh <inputDir> <outputDir>: using specified directories for input/output
#
# Remarks:
# - After OCR, all files from the input directory are deleted. If you want to keep the input files, just comment out the call of the function 'cleanup_inputDir' at the end of the script.
# Directories that are used when the script is called without parameters
inputDirDefault='/var/OCR/Input'
outputDirDefault='/var/OCR/Output'

# Options handed to every OCRmyPDF call - adjust them to your needs.
# Reference for the OCRmyPDF command line: https://ocrmypdf.readthedocs.io/en/latest/cookbook.html
#   -l deu+eng         Languages expected in the PDF (the matching tesseract language files have to be installed)
#   --output-type pdf  Write a standard PDF (OCRmyPDF creates PDF/A documents by default)
ocrmypdfCmdArgs='-l deu+eng --output-type pdf'

# Options handed to tesseract (ONLY used when an image file is converted to PDF) - adjust them to your needs.
#   -l deu+eng  Languages expected in the image (the matching tesseract language files have to be installed)
#   pdf         Output should be PDF
imageConvertCmdArgs='-l deu+eng pdf'

# Counters for the summary that is printed at the end of the script
countPDF=0 countImage=0 countCopy=0
# Print all arguments (joined by single spaces) as one line on stderr
errorecho() {
  local message="$*"
  printf '%s\n' "${message}" >&2
}
# Name under which the script was called, used in the usage message
scriptName="${0##*/}"
#
# Check for parameters
#
if [ "$#" -ne 0 ] && [ "$#" -ne 2 ]
then
  errorecho "ERROR: Wrong number of parameters!"
  errorecho "Usage: ${scriptName} (no parameter): using default directories for input/output"
  errorecho "Usage: ${scriptName} <inputDir> <outputDir>: using specified directories for input/output"
  exit 1
fi
# Both values are empty when the script was called without parameters
inputDir=$1
outputDir=$2
#
# Check if OCRmyPDF is available
#
if ! command -v ocrmypdf >/dev/null 2>&1
then
  errorecho "ERROR: OCRmyPDF is not available!"
  errorecho "Is OCRmyPDF installed?"
  exit 1
fi
#
# Use default directories if none were specified
#
if [ -z "${inputDir}" ]
then
  inputDir=${inputDirDefault}
  echo "No input directory given, using the default input directory ${inputDir}"
  echo
fi
if [ -z "${outputDir}" ]
then
  outputDir=${outputDirDefault}
  echo "No output directory given, using the default output directory ${outputDir}"
  echo
fi
#
# Check if directories already exist
#
if [ ! -d "${inputDir}" ]
then
  errorecho "ERROR: The input directory ${inputDir} does not exist!"
  exit 1
fi
if [ ! -d "${outputDir}" ]
then
  echo "The output directory does not exist -> creating ${outputDir}"
  # Abort if the directory cannot be created: without an output directory
  # nothing can be saved, but the input files would still be removed at the end.
  if ! mkdir -p "${outputDir}"
  then
    errorecho "ERROR: The output directory ${outputDir} could not be created!"
    exit 1
  fi
  echo
fi
#
# Locking
#
# The script should only run in one instance per input directory.
# So the lock directory is saved in the input directory, not under /var/lock
lockdir="${inputDir}/.ocrmyfiles.lock"
# A second "lock" directory in the output directory indicates that the script is currently running
runningdir="${outputDir}/.ocrmyfiles_running.lock"
# mkdir fails if the directory already exists, so only one instance can get the lock
if mkdir "$lockdir"
then
  # Remove the locks when the script finishes.
  # The trap is only installed after the lock was acquired, so an aborting
  # second instance never removes the lock of the running instance.
  # cleanup_locks is looked up when the trap fires, so it may be defined later in the file.
  trap cleanup_locks EXIT
else
  errorecho "Script is currently running for input directory ${inputDir}, aborting..."
  exit 1
fi
# The running marker is only informational: -p tolerates a marker left over
# from an aborted run, any other failure is reported without aborting.
if ! mkdir -p "$runningdir"
then
  errorecho "WARNING: Could not create ${runningdir}"
fi
# Remove both lock directories: the lock in the input directory ($lockdir)
# and the running marker in the output directory ($runningdir)
cleanup_locks() {
  local lockPath
  for lockPath in "$lockdir" "$runningdir"; do
    rm -rf "$lockPath"
  done
}
# Delete everything inside the input directory; the directory itself is kept.
# ${inputDir:?} aborts if inputDir is empty or unset, so the pattern can never become "/*".
# Hidden entries are only matched while the shell option dotglob is enabled.
cleanup_inputDir() {
  local -a entries
  entries=("${inputDir:?}"/*)
  rm -rf "${entries[@]}"
}
#
# Function to read the input directory and OCR all contained PDFs recursively
#
# The directory given as $1 is mirrored below ${outputDir}:
# - subdirectories are created in the output directory and processed recursively
# - PDF files are passed to ocrmypdf (and copied unchanged if ocrmypdf fails)
# - image files are converted to PDF with tesseract (and copied unchanged if tesseract fails)
# - all other files are copied unchanged
# A file is skipped if its target already exists in the output directory.
#
# Arguments:
#   $1: Directory to process (the input directory itself or a directory below it)
# Globals read:
#   inputDir, outputDir, lockdir, ocrmypdfCmdArgs, imageConvertCmdArgs
# Globals modified:
#   countPDF, countImage, countCopy
# Returns:
#   0 unless creating a directory or copying a file into the output directory failed, 1 otherwise
#
ocr_recursive() {
  # All working variables are local because the function calls itself
  local entry relPath relStem fileName fileType kind target
  local result=0
  for entry in "$1"/*; do
    # Skip lock directory
    if [ "$entry" = "$lockdir" ]; then
      continue
    fi
    # Path relative to the input directory (starts with "/").
    # Only the leading prefix is removed, so a path that contains the text
    # of ${inputDir} a second time is not damaged.
    relPath="${entry#"$inputDir"}"
    if [ -d "$entry" ]; then
      if mkdir -p "${outputDir}${relPath}"; then
        ocr_recursive "$entry" || result=1
      else
        echo "ERROR: Could not create directory ${outputDir}${relPath}" >&2
        result=1
      fi
      continue
    fi
    # Neither a directory nor a regular file (e.g. the unexpanded pattern of an empty directory)
    if [ ! -f "$entry" ]; then
      continue
    fi
    fileType="$(file -b "$entry")"
    # Relative path without file extension. The extension is only removed
    # if the file name itself contains one, so a dot in a directory name
    # is never mistaken for the start of an extension.
    fileName="${relPath##*/}"
    if [[ "$fileName" == ?*.* ]]; then
      relStem="${relPath%.*}"
    else
      relStem="$relPath"
    fi
    # Determine how the file is handled and where the result is written
    if [ "${fileType%%,*}" = "PDF document" ]; then
      kind="pdf"
      target="${outputDir}${relPath}"
    elif [[ "${fileType}" == *"image data"* ]]; then
      kind="image"
      target="${outputDir}${relStem}.pdf"
    else
      kind="other"
      target="${outputDir}${relPath}"
    fi
    if [ -f "${target}" ]; then
      # If the file already exists in the output directory, skip it.
      echo "File ${target} already exists, skipping..."
      continue
    fi
    case "$kind" in
      pdf)
        # It's a PDF file -> OCR it
        echo "Processing (PDF) $entry -> ${target}"
        # The options are stored in one string and are split into words on purpose
        # shellcheck disable=SC2086
        if ! ocrmypdf ${ocrmypdfCmdArgs} "${entry}" "${target}"; then
          # Error while processing PDF file, maybe it already contains a text layer -> simply copy to output directory
          cp "${entry}" "${target}" || result=1
        fi
        echo "Done"
        echo
        countPDF=$((countPDF + 1))
        ;;
      image)
        # It's an image -> convert to PDF and OCR it
        echo "Processing (image) $entry -> ${target}"
        # tesseract gets the output path without the ".pdf" extension
        # shellcheck disable=SC2086
        if ! tesseract "${entry}" "${outputDir}${relStem}" ${imageConvertCmdArgs}; then
          # Keep the original image, otherwise it would be lost when the input directory is cleaned up
          echo "ERROR: Could not convert $entry, copying it unchanged to ${outputDir}${relPath}" >&2
          cp "${entry}" "${outputDir}${relPath}" || result=1
        fi
        echo "Done"
        echo
        countImage=$((countImage + 1))
        ;;
      *)
        # Other file types -> just copy to output directory.
        echo "Copy $entry -> ${target}"
        cp "${entry}" "${target}" || result=1
        echo "Done"
        echo
        countCopy=$((countCopy + 1))
        ;;
    esac
  done
  return "$result"
}
# Let "*" also match hidden files, so they are processed and cleaned up like all other files
shopt -s dotglob
if ocr_recursive "${inputDir}"
then
  # Processing reported success -> the input files are no longer needed
  cleanup_inputDir
  exitCode=0
else
  # Keep the input files, otherwise files that could not be saved would be lost
  errorecho "ERROR: Not all files could be processed, the input directory ${inputDir} is not cleaned up!"
  exitCode=1
fi
shopt -u dotglob
echo
echo "Finished"
echo "PDF files processed: ${countPDF}"
echo "Image files processed: ${countImage}"
echo "Other files copied: ${countCopy}"
exit "${exitCode}"