-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocr.sh
executable file
·76 lines (59 loc) · 2.71 KB
/
ocr.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/bin/bash
#
# Download the PDFs listed in a JSON manifest, render every page to a JPEG
# and OCR each page with Tesseract (language "bul"), trying several OEM/PSM
# combinations so their results can be compared.
#
# Usage: ocr.sh <json_file> [number_of_items]
#   json_file        JSON document with a top-level "files" array of URLs.
#   number_of_items  How many leading entries of .files to process;
#                    0 or omitted means all of them.
#
# Everything is written to a "text" directory next to the JSON file:
#   <page>_oem<O>_psm<P>.txt  one result per page image and OEM/PSM pair
#   all_text.txt              every result appended, each preceded by a header
#   <pdf file name>.txt       empty marker created once a PDF has been
#                             processed; later runs skip PDFs that have one
#
# Requires: jq, wget, pdftoppm, tesseract.

# Check if at least a JSON file is provided as an argument
if [ $# -lt 1 ]; then
  echo "Usage: $0 <json_file> [number_of_items]"
  exit 1
fi

json_file="$1"
number_of_items="${2:-0}" # Default to 0, indicating all items

if [ ! -f "$json_file" ]; then
  echo "Error: JSON file '$json_file' not found." >&2
  exit 1
fi

# Reject anything that is not a plain non-negative integer: the value is used
# in arithmetic tests and handed to jq as a number.
if ! [[ "$number_of_items" =~ ^[0-9]+$ ]]; then
  echo "Error: number_of_items must be a non-negative integer, got '$number_of_items'." >&2
  exit 1
fi
# Force base 10 so input such as "08" is neither read as octal nor rejected
# by jq as invalid JSON.
number_of_items=$((10#$number_of_items))

text_dir="$(dirname "$json_file")/text"
output_text_file="$text_dir/all_text.txt" # Text file collecting all OCR results

# Create the text directory and the collected-output file if they don't exist
mkdir -p "$text_dir" || exit 1
touch "$output_text_file" || exit 1

# Select a specified number of items from the JSON file, or all of them.
# The count reaches jq through --argjson instead of being spliced into the
# program text.
if [ "$number_of_items" -eq 0 ]; then
  items_to_process='.files[]'
else
  items_to_process='.files[:$n][]'
fi

jq -r --argjson n "$number_of_items" "$items_to_process" "$json_file" |
  while IFS= read -r link; do
    [ -n "$link" ] || continue

    file_name=$(basename -- "$link")
    pdf_path="$text_dir/$file_name"
    page_prefix="$text_dir/${file_name%.*}"
    done_marker="$text_dir/$file_name.txt"

    if [ -e "$done_marker" ]; then
      echo "File $file_name has already been processed. Skipping."
      continue
    fi

    echo "Downloading $file_name..."
    if ! wget -q "$link" -O "$pdf_path"; then
      echo "Error: failed to download $link. Skipping." >&2
      # wget -O creates the target even when the transfer fails.
      rm -f -- "$pdf_path"
      continue
    fi

    echo "Converting $file_name to images..."
    if ! pdftoppm -jpeg -r 300 "$pdf_path" "$page_prefix"; then
      echo "Error: failed to convert $file_name to images. Skipping." >&2
      rm -f -- "$pdf_path"
      continue
    fi

    # Remove the PDF file after conversion to images
    rm -f -- "$pdf_path"

    # pdftoppm names its pages "<prefix>-<page>.jpg". Matching exactly that
    # keeps the loop away from the .txt results of earlier runs, which share
    # the same prefix.
    #
    # An earlier single-pass version ran only "--oem 1 --psm 12"; its note
    # said PSM 11 separates the text too much.
    for image_file in "$page_prefix"-*.jpg; do
      # Without nullglob an unmatched pattern is passed through literally.
      [ -e "$image_file" ] || continue
      page_name=$(basename -- "${image_file%.*}")

      for psm in 3 6 11 12; do # Add other PSM values as needed
        for oem in 0 1 2 3; do # Iterate over OEM values
          # Output filename with OEM and PSM values included
          output_base="$text_dir/${page_name}_oem${oem}_psm${psm}"
          echo "Performing OCR on $image_file with OEM $oem and PSM $psm..."

          # Some OEM values need model data that may not be installed, so a
          # failing combination is reported and skipped, not fatal.
          if tesseract "$image_file" "$output_base" --oem "$oem" --psm "$psm" -l bul &&
            [ -f "${output_base}.txt" ]; then
            # Append the recognized text to the all_text file with OEM and PSM noted
            echo "OCR result for OEM $oem, PSM $psm" >>"$output_text_file"
            cat -- "${output_base}.txt" >>"$output_text_file"
          else
            echo "Warning: OCR failed on $image_file with OEM $oem and PSM $psm." >&2
          fi
        done
      done
    done

    # Record completion under the exact name the skip check above looks for.
    touch -- "$done_marker"

    # No explicit break is needed for number_of_items: the jq slice already
    # limits how many links reach this loop.
  done

# The loop ran in a pipeline; a jq failure (unreadable or malformed JSON,
# missing .files) would otherwise go unnoticed.
if [ "${PIPESTATUS[0]}" -ne 0 ]; then
  echo "Error: jq failed to read the file list from '$json_file'." >&2
  exit 1
fi

echo "All requested files processed."