-
Notifications
You must be signed in to change notification settings - Fork 1
/
pdf_ocr.sh
41 lines (31 loc) · 1.29 KB
/
pdf_ocr.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#!/bin/bash
### Convert scanned pdf files to image and apply OCR for brazilian portuguese
###
### Processes every regular file matching *.pdf in the current directory.
### For each "NAME.pdf" the script:
###   1. rasterizes the PDF to "NAME.tiff" with ImageMagick `convert`;
###   2. runs tesseract (language "por") three times on that image, using the
###      output bases "NAME" (default text output), "NAME_searchable" (pdf
###      config) and "NAME" again (hocr config).
### The exit status of every external command is echoed after it runs; a
### failure on one file does not stop the remaining files from being processed.
###
### Requires `convert` and `tesseract` on PATH. If the language data is not in
### tesseract's default search path, add e.g. `--tessdata-dir /usr/share`
### right after `tesseract` in the calls below.

# Echo the exit status of the step that just ran, followed by a blank line.
# Arguments: $1 - exit status to report
report_status() {
  echo "status code: $1"
  echo ""
}

# Rasterize one PDF and run the three OCR passes on the resulting image.
# Arguments: $1 - path of the PDF file
# Outputs:   progress and status lines on stdout; skip notice on stderr
# Returns:   0 once the OCR passes have been attempted (their individual
#            statuses are only reported, as before), 1 if no image was
#            produced and OCR was skipped
ocr_pdf() {
  local pdf=$1
  local output_file tiff_file exit_code

  # A name starting with '-' would be parsed as an option by convert and
  # tesseract; anchoring it to the current directory avoids that.
  case "${pdf}" in
    -*) pdf="./${pdf}" ;;
  esac

  # Strip only the trailing ".pdf". An unanchored substitution would rewrite
  # the first ".pdf" anywhere in the name (e.g. "a.pdf.old.pdf").
  output_file="${pdf%.pdf}"
  tiff_file="${output_file}.tiff"

  echo "scanned pdf to image: ${pdf}..."
  convert -density 300 -background white -alpha Off "${pdf}" "${tiff_file}"
  exit_code=$?
  report_status "${exit_code}"

  # Without an image every tesseract pass below would fail, so skip them.
  if [[ ! -s "${tiff_file}" ]]; then
    echo "skipping OCR, no image was produced: ${tiff_file}" >&2
    return 1
  fi

  echo "apply OCR on image (output txt file): ${tiff_file}..."
  tesseract "${tiff_file}" "${output_file}" -l por
  exit_code=$?
  report_status "${exit_code}"

  echo "apply OCR on image (output searchable pdf): ${tiff_file}..."
  tesseract "${tiff_file}" "${output_file}_searchable" -l por pdf
  exit_code=$?
  report_status "${exit_code}"

  echo "apply OCR on image (output hocr): ${tiff_file}..."
  tesseract "${tiff_file}" "${output_file}" -l por hocr
  exit_code=$?
  report_status "${exit_code}"

  return 0
}

# Run ocr_pdf on every PDF in the current directory.
# Always returns 0: per-file failures are reported, not propagated, matching
# the script's previous exit behaviour.
main() {
  local pdf

  # Let the shell expand the glob instead of parsing `ls` output, so names
  # with spaces, backslashes or other unusual characters arrive intact.
  for pdf in *.pdf; do
    # Skips the literal "*.pdf" left behind when nothing matches, as well as
    # directories whose name happens to end in ".pdf".
    [[ -f "${pdf}" ]] || continue
    ocr_pdf "${pdf}"
  done

  return 0
}

main "$@"