From 838126f5fecc52d303905ebf9a4b7a82fee2f39b Mon Sep 17 00:00:00 2001 From: Carl Wilson Date: Tue, 5 Mar 2024 18:06:41 +0000 Subject: [PATCH] REL: JHOVE v1.30 release candidate - bumped project versions: - JHOVE project version to 1.30.0-RC1; - PDF-hul to 1.12.5; - WAVE-hul to 1.8.3; - XML-hul to 1.5.4; - PNG-gdm to 1.3; - JSON handler to 1.2; - XML handler to 1.11; - improved handling of module version changes with templating in `jhove-installer`; and - added baseline testing script `create-1.30-target.sh`. --- jhove-apps/pom.xml | 2 +- jhove-bbt/scripts/create-1.30-target.sh | 147 + jhove-core/pom.xml | 2 +- .../hul/ois/jhove/handler/JsonHandler.java | 4 +- .../hul/ois/jhove/handler/XmlHandler.java | 6 +- jhove-ext-modules/pom.xml | 2 +- .../com/mcgath/jhove/module/PngModule.java | 12 +- jhove-installer/pom.xml | 35 +- jhove-installer/src/main/izpack/install.xml | 44 +- jhove-modules/aiff-hul/pom.xml | 2 +- jhove-modules/ascii-hul/pom.xml | 2 +- jhove-modules/gif-hul/pom.xml | 2 +- jhove-modules/html-hul/pom.xml | 4 +- jhove-modules/jpeg-hul/pom.xml | 2 +- jhove-modules/jpeg2000-hul/pom.xml | 2 +- jhove-modules/pdf-hul/pom.xml | 4 +- .../hul/ois/jhove/module/PdfModule.java | 8773 +++++++++-------- jhove-modules/pom.xml | 4 +- jhove-modules/tiff-hul/pom.xml | 2 +- jhove-modules/utf8-hul/pom.xml | 2 +- jhove-modules/wave-hul/pom.xml | 4 +- .../hul/ois/jhove/module/WaveModule.java | 4 +- jhove-modules/xml-hul/pom.xml | 4 +- .../hul/ois/jhove/module/XmlModule.java | 1952 ++-- pom.xml | 2 +- 25 files changed, 5589 insertions(+), 5430 deletions(-) create mode 100755 jhove-bbt/scripts/create-1.30-target.sh diff --git a/jhove-apps/pom.xml b/jhove-apps/pom.xml index 1ef75da71..afd0c7bf0 100644 --- a/jhove-apps/pom.xml +++ b/jhove-apps/pom.xml @@ -5,7 +5,7 @@ org.openpreservation.jhove jhove - 1.29.0-SNAPSHOT + 1.30.0-RC1 jhove-apps diff --git a/jhove-bbt/scripts/create-1.30-target.sh b/jhove-bbt/scripts/create-1.30-target.sh new file mode 100755 index 
000000000..990f16d09 --- /dev/null +++ b/jhove-bbt/scripts/create-1.30-target.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash + +testRoot="test-root" +paramCandidateVersion="" +paramBaselineVersion="" +baselineRoot="${testRoot}/baselines" +candidateRoot="${testRoot}/candidates" +targetRoot="${testRoot}/targets" +# Check the passed params to avoid disappointment +checkParams () { + OPTIND=1 # Reset in case getopts previously used + + while getopts "h?b:c:" opt; do # Grab the options + case "$opt" in + h|\?) + showHelp + exit 0 + ;; + b) paramBaselineVersion=$OPTARG + ;; + c) paramCandidateVersion=$OPTARG + ;; + esac + done + + if [ -z "$paramBaselineVersion" ] || [ -z "$paramCandidateVersion" ] + then + showHelp + exit 0 + fi + + baselineRoot="${baselineRoot}/${paramBaselineVersion}" + candidateRoot="${candidateRoot}/${paramCandidateVersion}" + targetRoot="${targetRoot}/${paramCandidateVersion}" +} + +# Show usage message +showHelp() { + echo "usage: create-target [-b ] [-c ] [-h|?]" + echo "" + echo " baselineVersion : The version number id for the baseline data." + echo " candidateVersion : The version number id for the candidate data." + echo "" + echo " -h|? : This message." +} + +# Execution starts here +checkParams "$@"; +if [[ -d "${targetRoot}" ]]; then + echo " - removing existing baseline at ${targetRoot}." + rm -rf "${targetRoot}" +fi + +echo "TEST BASELINE: Creating baseline" +# Simply copy baseline for now we're not making any changes +echo " - copying ${baselineRoot} baseline to ${targetRoot}" +cp -R "${baselineRoot}" "${targetRoot}" + +# Update release details for ePub module +find "${targetRoot}" -type f -name "audit.jhove.xml" -exec sed -i 's/^ EPUB-ptc<\/module>$/ EPUB-ptc<\/module>/' {} \; + +# Patch release details of the reporting module. 
+find "${targetRoot}" -type f -name "audit.jhove.xml" -exec sed -i 's/outputHandler release="1.10">XML/outputHandler release="1.11">XML/' {} \; +find "${targetRoot}" -type f -name "audit.jhove.xml" -exec sed -i 's/outputHandler release="1.1">JSON/outputHandler release="1.2">JSON/' {} \; + +# Update release details for PDF module +find "${targetRoot}" -type f -name "*.pdf.jhove.xml" -exec sed -i 's/^ PDF-hul<\/reportingModule>$/ PDF-hul<\/reportingModule>/' {} \; +find "${targetRoot}" -type f -name "audit.jhove.xml" -exec sed -i 's/^ PDF-hul<\/module>$/ PDF-hul<\/module>/' {} \; +find "${targetRoot}" -type f -name "audit-PDF-hul.jhove.xml" -exec sed -i 's/^ 1.12.4<\/release>$/ 1.12.5<\/release>/' {} \; +find "${targetRoot}" -type f -name "audit-PDF-hul.jhove.xml" -exec sed -i 's/2023-03-16/2024-03-05/' {} \; + +# Update release details for PNG module +find "${targetRoot}" -type f -name "*.png.jhove.xml" -exec sed -i 's/^ PNG-gdm<\/reportingModule>$/ PNG-gdm<\/reportingModule>/' {} \; +find "${targetRoot}" -type f -name "audit.jhove.xml" -exec sed -i 's/^ PNG-gdm<\/module>$/ PNG-gdm<\/module>/' {} \; +find "${targetRoot}" -type f -name "audit-PNG-gdm.jhove.xml" -exec sed -i 's/^ 1.2<\/release>$/ 1.3<\/release>/' {} \; +find "${targetRoot}" -type f -name "audit-PNG-gdm.jhove.xml" -exec sed -i 's/2023-03-16/2024-03-05/' {} \; + +# Update release details for WAVE module +find "${targetRoot}" -type f -name "*.wav.jhove.xml" -exec sed -i 's/^ WAVE-hul<\/reportingModule>$/ WAVE-hul<\/reportingModule>/' {} \; +find "${targetRoot}" -type f -name "audit.jhove.xml" -exec sed -i 's/^ WAVE-hul<\/module>$/ WAVE-hul<\/module>/' {} \; +find "${targetRoot}" -type f -name "audit-WAVE-hul.jhove.xml" -exec sed -i 's/^ 1.8.2<\/release>$/ 1.8.3<\/release>/' {} \; +find "${targetRoot}" -type f -name "audit-WAVE-hul.jhove.xml" -exec sed -i 's/2022-04-22/2024-03-05/' {} \; + +# Update release details for XML module +find "${targetRoot}" -type f '(' -name "*.xml.jhove.xml" -o -name 
"*.ent.jhove.xml" -o -name "*.dtd.jhove.xml" ')' -exec sed -i 's/^ XML-hul<\/reportingModule>$/ XML-hul<\/reportingModule>/' {} \; +find "${targetRoot}" -type f -name "audit.jhove.xml" -exec sed -i 's/^ XML-hul<\/module>$/ XML-hul<\/module>/' {} \; +find "${targetRoot}" -type f -name "audit-XML-hul.jhove.xml" -exec sed -i 's/^ 1.5.3<\/release>$/ 1.5.4<\/release>/' {} \; +find "${targetRoot}" -type f -name "audit-XML-hul.jhove.xml" -exec sed -i 's/2023-03-16/2024-03-05/' {} \; + +# Copy the XML file output changed by https://github.com/openpreserve/jhove/pull/889 +if [[ -f "${candidateRoot}/examples/modules/XML-hul/jhoveconf.xml.jhove.xml" ]]; then + cp "${candidateRoot}/examples/modules/XML-hul/jhoveconf.xml.jhove.xml" "${targetRoot}/examples/modules/XML-hul/jhoveconf.xml.jhove.xml" +fi + +# Copy the PDF Module results changed by https://github.com/openpreserve/jhove/pull/871 +if [[ -f "${candidateRoot}/regression/modules/PDF-hul/pr_871_a.pdf.jhove.xml" ]]; then + cp "${candidateRoot}/regression/modules/PDF-hul/pr_871_a.pdf.jhove.xml" "${targetRoot}/regression/modules/PDF-hul/pr_871_a.pdf.jhove.xml" +fi +if [[ -f "${candidateRoot}/regression/modules/PDF-hul/pr_871_b.pdf.jhove.xml" ]]; then + cp "${candidateRoot}/regression/modules/PDF-hul/pr_871_b.pdf.jhove.xml" "${targetRoot}/regression/modules/PDF-hul/pr_871_b.pdf.jhove.xml" +fi +if [[ -f "${candidateRoot}/regression/modules/PDF-hul/pr_871_c.pdf.jhove.xml" ]]; then + cp "${candidateRoot}/regression/modules/PDF-hul/pr_871_c.pdf.jhove.xml" "${targetRoot}/regression/modules/PDF-hul/pr_871_c.pdf.jhove.xml" +fi + +# Copy the PDF Module results changed by https://github.com/openpreserve/jhove/pull/882 +if [[ -f "${candidateRoot}/errors/modules/PDF-hul/pdf-hul-10-govdocs-803945.pdf.jhove.xml" ]]; then + cp "${candidateRoot}/errors/modules/PDF-hul/pdf-hul-10-govdocs-803945.pdf.jhove.xml" "${targetRoot}/errors/modules/PDF-hul/pdf-hul-10-govdocs-803945.pdf.jhove.xml" +fi +if [[ -f 
"${candidateRoot}/errors/modules/PDF-hul/pdf-hul-5-govdocs-659152.pdf.jhove.xml" ]]; then + cp "${candidateRoot}/errors/modules/PDF-hul/pdf-hul-5-govdocs-659152.pdf.jhove.xml" "${targetRoot}/errors/modules/PDF-hul/pdf-hul-5-govdocs-659152.pdf.jhove.xml" +fi +if [[ -f "${candidateRoot}/regression/modules/PDF-hul/issue_306.pdf.jhove.xml" ]]; then + cp "${candidateRoot}/regression/modules/PDF-hul/issue_306.pdf.jhove.xml" "${targetRoot}/regression/modules/PDF-hul/issue_306.pdf.jhove.xml" +fi + +# Copy the PNG Module results changed by https://github.com/openpreserve/jhove/pull/843 +if [[ -f "${candidateRoot}/regression/modules/PNG-gdm/issue_148.png.jhove.xml" ]]; then + cp "${candidateRoot}/regression/modules/PNG-gdm/issue_148.png.jhove.xml" "${targetRoot}/regression/modules/PNG-gdm/issue_148.png.jhove.xml" +fi + +declare -a indent_affected=("errors/modules/PDF-hul/pdf-hul-14-govdocs-489354.pdf.jhove.xml" + "errors/modules/PDF-hul/pdf-hul-9-govdocs-065694.pdf.jhove.xml" + "errors/modules/PDF-hul/pdf-hul-1-govdocs-519846.pdf.jhove.xml" + "errors/modules/PDF-hul/pdf-hul-49-32932439X.pdf.jhove.xml" + "errors/modules/JPEG2000-hul/is_jpx.jp2.jhove.xml" + "errors/modules/WAVE-hul/wf-pcm-44khz-8bit-mono-fmt-chunk-2-unrecognized-bytes.wav.jhove.xml" + "regression/modules/PNG-gdm/issue_694.png.jhove.xml" + "regression/modules/PDF-hul/null-string.pdf.jhove.xml" + "regression/modules/PDF-hul/pdf-hul-94-false-positive.pdf.jhove.xml" + "regression/modules/PDF-hul/issue_646.pdf.jhove.xml" + "regression/modules/PDF-hul/null-string-sig-2.pdf.jhove.xml" + "regression/modules/PDF-hul/null-string-sig-1.pdf.jhove.xml" + "regression/modules/PDF-hul/pdf-hul-40-govdocs-088919.pdf.jhove.xml" + "examples/modules/TIFF-hul/cramps.tif.jhove.xml" + "examples/modules/TIFF-hul/text.tif.jhove.xml" + "examples/modules/TIFF-hul/testpage-small.tif.jhove.xml" + "examples/modules/JPEG2000-hul/ROITest.jpx.jhove.xml" + "examples/modules/WAVE-hul/8-Bit-Noise-1.wav.jhove.xml" + 
"examples/modules/WAVE-hul/8-Bit-Noise-2.wav.jhove.xml" + ) +for filename in "${indent_affected[@]}" +do + if [[ -f "${candidateRoot}/${filename}" ]]; then + cp "${candidateRoot}/${filename}" "${targetRoot}/${filename}" + fi +done \ No newline at end of file diff --git a/jhove-core/pom.xml b/jhove-core/pom.xml index be11f4491..2a923bbfc 100644 --- a/jhove-core/pom.xml +++ b/jhove-core/pom.xml @@ -5,7 +5,7 @@ org.openpreservation.jhove jhove - 1.29.0-SNAPSHOT + 1.30.0-RC1 jhove-core diff --git a/jhove-core/src/main/java/edu/harvard/hul/ois/jhove/handler/JsonHandler.java b/jhove-core/src/main/java/edu/harvard/hul/ois/jhove/handler/JsonHandler.java index 0ed6d4291..eebd23059 100644 --- a/jhove-core/src/main/java/edu/harvard/hul/ois/jhove/handler/JsonHandler.java +++ b/jhove-core/src/main/java/edu/harvard/hul/ois/jhove/handler/JsonHandler.java @@ -68,13 +68,13 @@ public class JsonHandler extends HandlerBase { private static final String NAME = "JSON"; /** Handler release identifier. */ - private static final String RELEASE = "1.1"; + private static final String RELEASE = "1.2"; /** String release. */ private static final String RELEASE_CONSTANT = "release"; /** Handler release date. */ - private static final int[] DATE = { 2022, 04, 22 }; + private static final int[] DATE = { 2024, 03, 05 }; private static final String DATE_CONSTANT = "date"; diff --git a/jhove-core/src/main/java/edu/harvard/hul/ois/jhove/handler/XmlHandler.java b/jhove-core/src/main/java/edu/harvard/hul/ois/jhove/handler/XmlHandler.java index b417ea7f8..cd98de9fb 100644 --- a/jhove-core/src/main/java/edu/harvard/hul/ois/jhove/handler/XmlHandler.java +++ b/jhove-core/src/main/java/edu/harvard/hul/ois/jhove/handler/XmlHandler.java @@ -83,10 +83,10 @@ protected NumberFormat initialValue() { private static final String NAME = "XML"; /** Handler release identifier. */ - private static final String RELEASE = "1.10"; + private static final String RELEASE = "1.11"; /** Handler release date. 
*/ - private static final int[] DATE = { 2023, 04, 18 }; + private static final int[] DATE = { 2024, 03, 05 }; /** Handler informative note. */ private static final String NOTE = "This output handler is defined by the XML Schema " @@ -751,7 +751,7 @@ protected void showProperty(Property property) { // as this could result in a schema violation. if (Utils.isPropertyEmpty(property, arity)) return; - + String margin = getIndent(++_level); String margn2 = margin + " "; String margn3 = margn2 + " "; diff --git a/jhove-ext-modules/pom.xml b/jhove-ext-modules/pom.xml index d24f44b76..288e19652 100644 --- a/jhove-ext-modules/pom.xml +++ b/jhove-ext-modules/pom.xml @@ -3,7 +3,7 @@ org.openpreservation.jhove jhove - 1.29.0-SNAPSHOT + 1.30.0-RC1 jhove-ext-modules diff --git a/jhove-ext-modules/src/main/java/com/mcgath/jhove/module/PngModule.java b/jhove-ext-modules/src/main/java/com/mcgath/jhove/module/PngModule.java index cc10d57b9..2d31f01e5 100644 --- a/jhove-ext-modules/src/main/java/com/mcgath/jhove/module/PngModule.java +++ b/jhove-ext-modules/src/main/java/com/mcgath/jhove/module/PngModule.java @@ -43,8 +43,8 @@ public class PngModule extends ModuleBase { ******************************************************************/ private static final String NAME = "PNG-gdm"; - private static final String RELEASE = "1.2"; - private static final int[] DATE = { 2023, 03, 16 }; + private static final String RELEASE = "1.3"; + private static final int[] DATE = { 2024, 03, 05 }; private static final String[] FORMAT = { "PNG", " ISO/IEC 15948:2003", "Portable Network Graphics" }; @@ -254,8 +254,8 @@ public int parse(InputStream stream, RepInfo info, int parseIndex) return 0; } catch (EOFException e) { JhoveMessage msg = JhoveMessages.getMessageInstance( - MessageConstants.PNG_GDM_69.getId(), - String.format(MessageConstants.PNG_GDM_69.getMessage(), _nByte)); + MessageConstants.PNG_GDM_69.getId(), + String.format(MessageConstants.PNG_GDM_69.getMessage(), _nByte)); info.setMessage(new 
ErrorMessage(msg)); info.setWellFormed(false); return 0; @@ -264,8 +264,8 @@ public int parse(InputStream stream, RepInfo info, int parseIndex) // But it's better to catch them than let them fall through. // Treat them as bugs. JhoveMessage msg = JhoveMessages.getMessageInstance( - MessageConstants.PNG_GDM_70.getId(), - String.format(MessageConstants.PNG_GDM_70.getMessage(), e.getClass().getName())); + MessageConstants.PNG_GDM_70.getId(), + String.format(MessageConstants.PNG_GDM_70.getMessage(), e.getClass().getName())); info.setMessage(new ErrorMessage(msg)); info.setWellFormed(false); return 0; diff --git a/jhove-installer/pom.xml b/jhove-installer/pom.xml index 96ce9bf64..805a94357 100644 --- a/jhove-installer/pom.xml +++ b/jhove-installer/pom.xml @@ -5,7 +5,7 @@ org.openpreservation.jhove jhove - 1.29.0-SNAPSHOT + 1.30.0-RC1 jhove-installer @@ -18,6 +18,17 @@ ${project.build.directory}/staging ${project.build.directory} ${project.build.scriptSourceDirectory} + 1.6.2 + 1.4.2 + 1.4.3 + 1.4.3 + 1.4.4 + 1.5.4 + 1.12.5 + 1.9.4 + 1.7.3 + 1.8.3 + 1.5.4 @@ -168,57 +179,57 @@ org.openpreservation.jhove.modules aiff-hul - 1.6.2 + ${aiff.hul.version} org.openpreservation.jhove.modules ascii-hul - 1.4.2 + ${ascii.hul.version} org.openpreservation.jhove.modules gif-hul - 1.4.3 + ${gif.hul.version} org.openpreservation.jhove.modules html-hul - 1.4.3 + ${html.hul.version} org.openpreservation.jhove.modules jpeg2000-hul - 1.4.4 + ${jpeg2000.hul.version} org.openpreservation.jhove.modules jpeg-hul - 1.5.4 + ${jpeg.hul.version} org.openpreservation.jhove.modules pdf-hul - 1.12.4 + ${pdf.hul.version} org.openpreservation.jhove.modules tiff-hul - 1.9.4 + ${tiff.hul.version} org.openpreservation.jhove.modules utf8-hul - 1.7.3 + ${utf8.hul.version} org.openpreservation.jhove.modules wave-hul - 1.8.2 + ${wave.hul.version} org.openpreservation.jhove.modules xml-hul - 1.5.3 + ${xml.hul.version} diff --git a/jhove-installer/src/main/izpack/install.xml 
b/jhove-installer/src/main/izpack/install.xml index c551208fd..f8ec2b446 100644 --- a/jhove-installer/src/main/izpack/install.xml +++ b/jhove-installer/src/main/izpack/install.xml @@ -61,28 +61,28 @@ JHOVE application JARs including the internal modules and configuration files. - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + diff --git a/jhove-modules/aiff-hul/pom.xml b/jhove-modules/aiff-hul/pom.xml index e940ddcb6..3db631dd9 100644 --- a/jhove-modules/aiff-hul/pom.xml +++ b/jhove-modules/aiff-hul/pom.xml @@ -3,7 +3,7 @@ org.openpreservation.jhove.modules jhove-modules - 1.29.0-SNAPSHOT + 1.30.0-RC1 aiff-hul 1.6.2 diff --git a/jhove-modules/ascii-hul/pom.xml b/jhove-modules/ascii-hul/pom.xml index 1dd846492..5e69b7a8d 100644 --- a/jhove-modules/ascii-hul/pom.xml +++ b/jhove-modules/ascii-hul/pom.xml @@ -3,7 +3,7 @@ org.openpreservation.jhove.modules jhove-modules - 1.29.0-SNAPSHOT + 1.30.0-RC1 ascii-hul 1.4.2 diff --git a/jhove-modules/gif-hul/pom.xml b/jhove-modules/gif-hul/pom.xml index d2e96b196..66f00008a 100644 --- a/jhove-modules/gif-hul/pom.xml +++ b/jhove-modules/gif-hul/pom.xml @@ -3,7 +3,7 @@ org.openpreservation.jhove.modules jhove-modules - 1.29.0-SNAPSHOT + 1.30.0-RC1 gif-hul 1.4.3 diff --git a/jhove-modules/html-hul/pom.xml b/jhove-modules/html-hul/pom.xml index 53992a74a..c948857ca 100644 --- a/jhove-modules/html-hul/pom.xml +++ b/jhove-modules/html-hul/pom.xml @@ -3,7 +3,7 @@ org.openpreservation.jhove.modules jhove-modules - 1.29.0-SNAPSHOT + 1.30.0-RC1 html-hul 1.4.3 @@ -14,7 +14,7 @@ org.openpreservation.jhove.modules xml-hul - 1.5.3 + 1.5.4 diff --git a/jhove-modules/jpeg-hul/pom.xml b/jhove-modules/jpeg-hul/pom.xml index 66551702d..1eacb6ad8 100644 --- a/jhove-modules/jpeg-hul/pom.xml +++ b/jhove-modules/jpeg-hul/pom.xml @@ -3,7 +3,7 @@ org.openpreservation.jhove.modules jhove-modules - 1.29.0-SNAPSHOT + 1.30.0-RC1 jpeg-hul 1.5.4 diff --git a/jhove-modules/jpeg2000-hul/pom.xml 
b/jhove-modules/jpeg2000-hul/pom.xml index 7593ef74b..8050c0d2d 100644 --- a/jhove-modules/jpeg2000-hul/pom.xml +++ b/jhove-modules/jpeg2000-hul/pom.xml @@ -3,7 +3,7 @@ org.openpreservation.jhove.modules jhove-modules - 1.29.0-SNAPSHOT + 1.30.0-RC1 jpeg2000-hul 1.4.4 diff --git a/jhove-modules/pdf-hul/pom.xml b/jhove-modules/pdf-hul/pom.xml index 3110a3897..c82114a0e 100644 --- a/jhove-modules/pdf-hul/pom.xml +++ b/jhove-modules/pdf-hul/pom.xml @@ -3,10 +3,10 @@ org.openpreservation.jhove.modules jhove-modules - 1.29.0-SNAPSHOT + 1.30.0-RC1 pdf-hul - 1.12.4 + 1.12.5 JHOVE PDF Module HUL PDF module developed by Harvard University Library diff --git a/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/PdfModule.java b/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/PdfModule.java index e5a3b7893..0b861987d 100644 --- a/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/PdfModule.java +++ b/jhove-modules/pdf-hul/src/main/java/edu/harvard/hul/ois/jhove/module/PdfModule.java @@ -111,4428 +111,4429 @@ */ public class PdfModule extends ModuleBase { - public static final String MIME_TYPE = "application/pdf"; - public static final String EXT = ".pdf"; - public static final int MAX_PAGE_TREE_DEPTH = 100; - public static final int MAX_OBJ_STREAM_DEPTH = 30; - - private static final String ENCODING_PREFIX = "ENC="; - - private static final String DEFAULT_PAGE_LAYOUT = "SinglePage"; - private static final String DEFAULT_MODE = "UseNone"; - - private static final String FILTER_NAME_CCITT = "CCITTFaxDecode"; - private static final String FILTER_NAME_CRYPT = "Crypt"; - private static final String FILTER_NAME_DCT = "DCTDecode"; - private static final String FILTER_NAME_FLATE = "FlateDecode"; - private static final String FILTER_NAME_JPX = "JPXDecode"; - private static final String FILTER_NAME_LZW = "LZWDecode"; - private static final String FILTER_NAME_RUN_LENGTH = "RunLengthDecode"; - - private static final String 
FILTER_VAL_STANDARD = "Standard"; - - private static final String RESOURCE_NAME_XOBJECT = "XObject"; - - private static final String FONT_TYPE0 = "Type0"; - private static final String FONT_TYPE1 = "Type1"; - private static final String FONT_TYPE3 = "Type3"; - private static final String FONT_MM_TYPE1 = "MMType1"; - private static final String FONT_TRUE_TYPE = "TrueType"; - private static final String FONT_CID_TYPE0 = "CIDFontType0"; - private static final String FONT_CID_TYPE2 = "CIDFontType2"; - - private static final String ACTION_VAL_GOTO = "GoTo"; - - private static final String DICT_KEY_DIRECTION = "Direction"; - - private static final String DICT_KEY_CENTER_WINDOW = "CenterWindow"; - private static final String DICT_KEY_DISP_DOC_TITLE = "DisplayDocTitle"; - private static final String DICT_KEY_FIT_WINDOW = "FitWindow"; - private static final String DICT_KEY_HIDE_MENUBAR = "HideMenubar"; - private static final String DICT_KEY_HIDE_TOOLBAR = "HideToolbar"; - private static final String DICT_KEY_HIDE_WINDOW_UI = "HideWindowUI"; - private static final String DICT_KEY_NO_FULL_PAGE = "NonFullScreenPageMode"; - private static final String DICT_KEY_PAGE_CLIP = "PageClip"; - private static final String DICT_KEY_PRINT_AREA = "PrintArea"; - private static final String DICT_KEY_VIEW_AREA = "ViewArea"; - private static final String DICT_KEY_VIEW_CLIP = "ViewClip"; - - private static final String PROP_NAME_CENTER_WINDOW = DICT_KEY_CENTER_WINDOW; - private static final String PROP_NAME_DISP_DOC_TITLE = DICT_KEY_DISP_DOC_TITLE; - private static final String PROP_NAME_FIT_WINDOW = DICT_KEY_FIT_WINDOW; - private static final String PROP_NAME_HIDE_MENUBAR = DICT_KEY_HIDE_MENUBAR; - private static final String PROP_NAME_HIDE_TOOLBAR = DICT_KEY_HIDE_TOOLBAR; - private static final String PROP_NAME_HIDE_WINDOW_UI = DICT_KEY_HIDE_WINDOW_UI; - private static final String PROP_NAME_NO_FULL_PAGE = DICT_KEY_NO_FULL_PAGE; - private static final String PROP_NAME_PAGE_CLIP = 
DICT_KEY_PAGE_CLIP; - private static final String PROP_NAME_PRINT_AREA = DICT_KEY_PRINT_AREA; - private static final String PROP_NAME_VIEW_AREA = DICT_KEY_VIEW_AREA; - private static final String PROP_NAME_VIEW_CLIP = DICT_KEY_VIEW_CLIP; - private static final String PROP_NAME_DIRECTION = DICT_KEY_DIRECTION; - - private static final String DICT_KEY_FONT_DESCRIPTOR = "FontDescriptor"; - private static final String DICT_KEY_STARTXREF = "startxref"; - private static final String DICT_KEY_BASE_FONT = "BaseFont"; - private static final String DICT_KEY_CONTENTS = "Contents"; - private static final String DICT_KEY_CID_INFO = "CIDSystemInfo"; - private static final String DICT_KEY_DIFFERENCES = "Differences"; - private static final String DICT_KEY_RESOURCES = "Resources"; - private static final String DICT_KEY_TO_UNICODE = "ToUnicode"; - private static final String DICT_KEY_ROOT = "Root"; - private static final String DICT_KEY_RECT = "Rect"; - private static final String DICT_KEY_DEST = "Dest"; - private static final String DICT_KEY_FIRST_CHAR = "FirstChar"; - private static final String DICT_KEY_LAST_CHAR = "LastChar"; - private static final String DICT_KEY_TRAILER = "trailer"; - private static final String DICT_KEY_SIZE = "Size"; - private static final String DICT_KEY_ENCRYPT = "Encrypt"; - private static final String DICT_KEY_STMF = "StmF"; - private static final String DICT_KEY_INFO = "Info"; - private static final String DICT_KEY_ID = "ID"; - private static final String DICT_KEY_FONT_NAME = "FontName"; - private static final String DICT_KEY_FONT_FILE = "FontFile"; - private static final String DICT_KEY_FONT_FILE_2 = "FontFile2"; - private static final String DICT_KEY_FONT_FILE_3 = "FontFile3"; - private static final String DICT_KEY_BBOX = "BBox"; - private static final String DICT_KEY_FONT_BBOX = "FontBBox"; - private static final String DICT_KEY_XREF_STREAM = "XRefStm"; - private static final String DICT_KEY_VIEWER_PREFS = "ViewerPreferences"; - private static final 
String DICT_KEY_PAGE_LAYOUT = "PageLayout"; - private static final String DICT_KEY_PAGE_MODE = "PageMode"; - private static final String DICT_KEY_OUTLINES = "Outlines"; - private static final String DICT_KEY_ORDERING = "Ordering"; - private static final String DICT_KEY_REGISTRY = "Registry"; - private static final String DICT_KEY_SUPPLEMENT = "Supplement"; - private static final String DICT_KEY_LANG = "Lang"; - private static final String DICT_KEY_PAGES = "Pages"; - private static final String DICT_KEY_PAGE_LABELS = "PageLabels"; - private static final String DICT_KEY_TYPE = "Type"; - private static final String DICT_KEY_VERSION = "Version"; - private static final String DICT_KEY_EXTENSIONS = "Extensions"; - private static final String DICT_KEY_EXTENSIONLEVEL = "ExtensionLevel"; - private static final String DICT_KEY_BASEVERSION = "BaseVersion"; - private static final String PROP_NAME_BASEVERSION = DICT_KEY_BASEVERSION; - private static final String PROP_NAME_EXTENSIONLEVEL = DICT_KEY_EXTENSIONLEVEL; - private static final String PROP_NAME_DEVELOPERPREFIX = "DeveloperPrefix"; - private static final String DICT_KEY_NAME = "Name"; - private static final String DICT_KEY_NAMES = "Names"; - private static final String DICT_KEY_EMBEDDED_FILES = "EmbeddedFiles"; - private static final String DICT_KEY_DESTS = "Dests"; - private static final String DICT_KEY_FILTER = "Filter"; - private static final String DICT_KEY_K = "K"; - private static final String DICT_KEY_P = "P"; - private static final String DICT_KEY_R = "R"; - private static final String DICT_KEY_V = "V"; - private static final String DICT_KEY_ENCODING = "Encoding"; - private static final String DICT_KEY_BASE_ENCODING = "BaseEncoding"; - private static final String DICT_KEY_LENGTH = "Length"; - private static final String DICT_KEY_WIDTH = "Width"; - private static final String DICT_KEY_HEIGHT = "Height"; - private static final String DICT_KEY_KEY_LENGTH = "KeyLength"; - private static final String DICT_KEY_TITLE = 
"Title"; - private static final String DICT_KEY_AUTHOR = "Author"; - private static final String DICT_KEY_SUBJECT = "Subject"; - private static final String DICT_KEY_KEYWORDS = "Keywords"; - private static final String DICT_KEY_CREATOR = "Creator"; - private static final String DICT_KEY_PRODUCER = "Producer"; - private static final String DICT_KEY_CREATION_DATE = "CreationDate"; - private static final String DICT_KEY_MODIFIED_DATE = "ModDate"; - private static final String DICT_KEY_TRAPPED = "Trapped"; - private static final String DICT_KEY_XOBJ_SUBTYPE = "Subtype"; - private static final String DICT_KEY_FONT_SUBTYPE = DICT_KEY_XOBJ_SUBTYPE; - private static final String DICT_KEY_DECODE_PARAMS = "DecodeParms"; - private static final String DICT_KEY_COLOR_SPACE = "ColorSpace"; - private static final String DICT_KEY_METADATA = "Metadata"; - private static final String DICT_KEY_BITS_PER_COMPONENT = "BitsPerComponent"; - private static final String DICT_KEY_INTENT = "Intent"; - private static final String DICT_KEY_IMAGE_MASK = "ImageMask"; - private static final String DICT_KEY_DECODE = "Decode"; - private static final String DICT_KEY_INTERPOLATE = "Interpolate"; - private static final String DICT_KEY_DESCENDANT_FONTS = "DescendantFonts"; - private static final String DICT_KEY_ROTATE = "Rotate"; - private static final String DICT_KEY_USER_UNIT = "UserUnit"; - private static final String DICT_KEY_VIEWPORT = "VP"; - private static final String DICT_KEY_THUMB = "Thumb"; - private static final String DICT_KEY_MEASURE = "Measure"; - private static final String DICT_KEY_COUNT = "Count"; - private static final String DICT_KEY_PARENT = "Parent"; - private static final String DICT_KEY_PREV = "Prev"; - private static final String DICT_KEY_NEXT = "Next"; - private static final String DICT_KEY_FIRST = "First"; - private static final String DICT_KEY_LAST = "Last"; - private static final String DICT_KEY_FLAGS = "Flags"; - - private static final String KEY_VAL_CATALOG = "Catalog"; - 
private static final String KEY_VAL_PAGES = "Pages"; - - private static final String PROP_NAME_BASE_FONT = DICT_KEY_BASE_FONT; - private static final String PROP_NAME_CALLOUT_LINE = "CalloutLine"; - private static final String PROP_NAME_CMAP_DICT = "CMapDictionary"; - private static final String PROP_NAME_CID_INFO = DICT_KEY_CID_INFO; - private static final String PROP_NAME_CID_INFOS = "CIDSystemInfos"; - private static final String PROP_NAME_CONTENTS = DICT_KEY_CONTENTS; - private static final String PROP_NAME_DISTANCE = "Distance"; - private static final String PROP_NAME_DIFFERENCES = DICT_KEY_DIFFERENCES; - private static final String PROP_NAME_ENCODING = DICT_KEY_ENCODING; - private static final String PROP_NAME_ENCODING_DICTIONARY = "EncodingDictionary"; - private static final String PROP_NAME_BASE_ENCODING = DICT_KEY_BASE_ENCODING; - private static final String PROP_NAME_EXTERNAL_STREAMS = "ExternalStreams"; - private static final String PROP_NAME_FILTER = DICT_KEY_FILTER; - private static final String PROP_NAME_FILTERS = "Filters"; - private static final String PROP_NAME_FILE = "File"; - private static final String PROP_NAME_FIRST_CHAR = DICT_KEY_FIRST_CHAR; - private static final String PROP_NAME_FLAGS = DICT_KEY_FLAGS; - private static final String PROP_NAME_AREA = "Area"; - private static final String PROP_NAME_IMAGE = "Image"; - private static final String PROP_NAME_IMAGES = "Images"; - private static final String PROP_NAME_OBJECTS = "Objects"; - private static final String PROP_NAME_RESOURCES = DICT_KEY_RESOURCES; - private static final String PROP_NAME_SUBTYPE = DICT_KEY_XOBJ_SUBTYPE; - private static final String PROP_NAME_FREE_OBJECTS = "FreeObjects"; - private static final String PROP_NAME_INC_UPDATES = "IncrementalUpdates"; - private static final String PROP_NAME_DOC_CATALOG = "DocumentCatalog"; - private static final String PROP_NAME_ENCRYPTION = "Encryption"; - private static final String PROP_NAME_KEY_LENGTH = DICT_KEY_KEY_LENGTH; - private 
static final String PROP_NAME_INFO = DICT_KEY_INFO; - private static final String PROP_NAME_DESTINATION = "Destination"; - private static final String PROP_NAME_CHILDREN = "Children"; - private static final String PROP_NAME_PAGE_LAYOUT = DICT_KEY_PAGE_LAYOUT; - private static final String PROP_NAME_LANG = "Language"; - private static final String PROP_NAME_LAST_CHAR = DICT_KEY_LAST_CHAR; - private static final String PROP_NAME_MEASURE = DICT_KEY_MEASURE; - private static final String PROP_NAME_SECURITY_HANDLER = "SecurityHandler"; - private static final String PROP_NAME_EFF = "EFF"; - private static final String PROP_NAME_ALGORITHM = "Algorithm"; - private static final String PROP_NAME_RECT = DICT_KEY_RECT; - private static final String PROP_NAME_REVISION = "Revision"; - private static final String PROP_NAME_OWNER_STRING = "OwnerString"; - private static final String PROP_NAME_USER_STRING = "UserString"; - private static final String PROP_NAME_OWNERKEY_STRING = "OwnerEncryptionKey"; - private static final String PROP_NAME_USERKEY_STRING = "UserEncryptionKey"; - private static final String PROP_NAME_USER_UNIT = DICT_KEY_USER_UNIT; - private static final String PROP_NAME_STANDARD_SECURITY_HANDLER = "StandardSecurityHandler"; - private static final String PROP_NAME_TITLE = DICT_KEY_TITLE; - private static final String PROP_NAME_AUTHOR = DICT_KEY_AUTHOR; - private static final String PROP_NAME_SUBJECT = DICT_KEY_SUBJECT; - private static final String PROP_NAME_KEYWORDS = DICT_KEY_KEYWORDS; - private static final String PROP_NAME_CREATOR = DICT_KEY_CREATOR; - private static final String PROP_NAME_PRODUCER = DICT_KEY_PRODUCER; - private static final String PROP_NAME_CREATION_DATE = DICT_KEY_CREATION_DATE; - private static final String PROP_NAME_MODIFIED_DATE = DICT_KEY_MODIFIED_DATE; - private static final String PROP_NAME_TRAPPED = DICT_KEY_TRAPPED; - private static final String PROP_NAME_FILTER_PIPELINE = "FilterPipeline"; - private static final String 
PROP_NAME_NISO_IMAGE_MD = "NisoImageMetadata"; - private static final String PROP_NAME_COLOR_SPACE = DICT_KEY_COLOR_SPACE; - private static final String PROP_NAME_ACTION_DEST = "ActionDest"; - private static final String PROP_NAME_ANNOTATION = "Annotation"; - private static final String PROP_NAME_APP_DICT = "AppearanceDictionary"; - private static final String PROP_NAME_INTENT = DICT_KEY_INTENT; - private static final String PROP_NAME_IMAGE_MASK = DICT_KEY_IMAGE_MASK; - private static final String PROP_NAME_DECODE = DICT_KEY_DECODE; - private static final String PROP_NAME_NAME = DICT_KEY_NAME; - private static final String PROP_NAME_ID = DICT_KEY_ID; - private static final String PROP_NAME_ITEM = "Item"; - private static final String PROP_NAME_INTERPOLATE = DICT_KEY_INTERPOLATE; - private static final String PROP_NAME_FONT_TYPE0 = FONT_TYPE0; - private static final String PROP_NAME_FONT_TYPE1 = FONT_TYPE1; - private static final String PROP_NAME_FONT_TYPE3 = FONT_TYPE3; - private static final String PROP_NAME_FONT_MM_TYPE1 = FONT_MM_TYPE1; - private static final String PROP_NAME_FONT_TRUE_TYPE = FONT_TRUE_TYPE; - private static final String PROP_NAME_FONT_CID_TYPE0 = FONT_CID_TYPE0; - private static final String PROP_NAME_FONT_CID_TYPE2 = FONT_CID_TYPE2; - private static final String PROP_NAME_FONT = "Font"; - private static final String PROP_NAME_FONTS = "Fonts"; - private static final String PROP_NAME_FONT_SUBSET = "FontSubset"; - private static final String PROP_NAME_FONT_BBOX = DICT_KEY_FONT_BBOX; - private static final String PROP_NAME_FONT_DESC = DICT_KEY_FONT_DESCRIPTOR; - private static final String PROP_NAME_FONT_FILE = DICT_KEY_FONT_FILE; - private static final String PROP_NAME_FONT_FILE_2 = DICT_KEY_FONT_FILE_2; - private static final String PROP_NAME_FONT_FILE_3 = DICT_KEY_FONT_FILE_3; - private static final String PROP_NAME_FONT_NAME = DICT_KEY_FONT_NAME; - private static final String PROP_NAME_PDF_METADATA = "PDFMetadata"; - private static final 
String PROP_NAME_LAST_MOD = "LastModified"; - private static final String PROP_NAME_OUTLINES = DICT_KEY_OUTLINES; - private static final String PROP_NAME_REGISTRY = DICT_KEY_REGISTRY; - private static final String PROP_NAME_SUPPLEMENT = DICT_KEY_SUPPLEMENT; - private static final String PROP_NAME_PAGES = DICT_KEY_PAGES; - private static final String PROP_NAME_SEQUENCE = "Sequence"; - private static final String PROP_NAME_ANNOTATIONS = "Annotations"; - private static final String PROP_NAME_ROTATE = DICT_KEY_ROTATE; - private static final String PROP_NAME_REPLY_TYPE = "ReplyType"; - private static final String PROP_NAME_VIEWPORT = "Viewport"; - private static final String PROP_NAME_VIEWPORTS = "Viewports"; - private static final String PROP_NAME_THUMB = DICT_KEY_THUMB; - private static final String PROP_NAME_TO_UNICODE = DICT_KEY_TO_UNICODE; - private static final String PROP_NAME_PAGE = "Page"; - private static final String PROP_NAME_LABEL = "Label"; - private static final String PROP_NAME_RATIO = "Ratio"; - - private static final String PROP_VAL_CROP_BOX = "CropBox"; - private static final String PROP_VAL_FONT_BBOX = DICT_KEY_FONT_BBOX; - private static final String PROP_VAL_NULL = "null"; - private static final String PROP_VAL_EXTERNAL = "External"; - private static final String PROP_VAL_NO_FLAGS_SET = "No flags set"; - private static final String XOBJ_SUBTYPE_IMAGE = PROP_NAME_IMAGE; - private static final String EMPTY_LABEL_PROPERTY = "[empty]"; - - /****************************************************************** - * PRIVATE CLASS FIELDS. 
- ******************************************************************/ - - private static final String NAME = "PDF-hul"; - private static final String RELEASE = "1.12.4"; - private static final int[] DATE = { 2023, 03, 16 }; - private static final String[] FORMAT = { "PDF", - "Portable Document Format" }; - private static final String COVERAGE = "PDF 1.0-1.6; " - + "PDF/X-1 (ISO 15930-1:2001), X-1a (ISO 15930-4:2003), " - + "X-2 (ISO 15930-5:2003), and X-3 (ISO 15930-6:2003); " - + "Tagged PDF; Linearized PDF"; - private static final String[] MIMETYPE = { MIME_TYPE }; - private static final String WELLFORMED = "A PDF file is " - + "well-formed if it meets the criteria defined in Chapter " - + "3 of the PDF Reference 1.6 (5th edition, 2004)"; - private static final String VALIDITY = null; - private static final String REPINFO = null; - private static final String NOTE = "This module does *not* validate data " - + "within content streams (including operators) or encrypted data"; - private static final String RIGHTS = "Copyright 2003-2007 by JSTOR and " - + "the President and Fellows of Harvard College. 
" - + "Released under the GNU Lesser General Public License."; - private static final String ENCRYPTED = ""; + public static final String MIME_TYPE = "application/pdf"; + public static final String EXT = ".pdf"; + public static final int MAX_PAGE_TREE_DEPTH = 100; + public static final int MAX_OBJ_STREAM_DEPTH = 30; + + private static final String ENCODING_PREFIX = "ENC="; + + private static final String DEFAULT_PAGE_LAYOUT = "SinglePage"; + private static final String DEFAULT_MODE = "UseNone"; + + private static final String FILTER_NAME_CCITT = "CCITTFaxDecode"; + private static final String FILTER_NAME_CRYPT = "Crypt"; + private static final String FILTER_NAME_DCT = "DCTDecode"; + private static final String FILTER_NAME_FLATE = "FlateDecode"; + private static final String FILTER_NAME_JPX = "JPXDecode"; + private static final String FILTER_NAME_LZW = "LZWDecode"; + private static final String FILTER_NAME_RUN_LENGTH = "RunLengthDecode"; + + private static final String FILTER_VAL_STANDARD = "Standard"; + + private static final String RESOURCE_NAME_XOBJECT = "XObject"; + + private static final String FONT_TYPE0 = "Type0"; + private static final String FONT_TYPE1 = "Type1"; + private static final String FONT_TYPE3 = "Type3"; + private static final String FONT_MM_TYPE1 = "MMType1"; + private static final String FONT_TRUE_TYPE = "TrueType"; + private static final String FONT_CID_TYPE0 = "CIDFontType0"; + private static final String FONT_CID_TYPE2 = "CIDFontType2"; + + private static final String ACTION_VAL_GOTO = "GoTo"; + + private static final String DICT_KEY_DIRECTION = "Direction"; + + private static final String DICT_KEY_CENTER_WINDOW = "CenterWindow"; + private static final String DICT_KEY_DISP_DOC_TITLE = "DisplayDocTitle"; + private static final String DICT_KEY_FIT_WINDOW = "FitWindow"; + private static final String DICT_KEY_HIDE_MENUBAR = "HideMenubar"; + private static final String DICT_KEY_HIDE_TOOLBAR = "HideToolbar"; + private static final String 
DICT_KEY_HIDE_WINDOW_UI = "HideWindowUI"; + private static final String DICT_KEY_NO_FULL_PAGE = "NonFullScreenPageMode"; + private static final String DICT_KEY_PAGE_CLIP = "PageClip"; + private static final String DICT_KEY_PRINT_AREA = "PrintArea"; + private static final String DICT_KEY_VIEW_AREA = "ViewArea"; + private static final String DICT_KEY_VIEW_CLIP = "ViewClip"; + + private static final String PROP_NAME_CENTER_WINDOW = DICT_KEY_CENTER_WINDOW; + private static final String PROP_NAME_DISP_DOC_TITLE = DICT_KEY_DISP_DOC_TITLE; + private static final String PROP_NAME_FIT_WINDOW = DICT_KEY_FIT_WINDOW; + private static final String PROP_NAME_HIDE_MENUBAR = DICT_KEY_HIDE_MENUBAR; + private static final String PROP_NAME_HIDE_TOOLBAR = DICT_KEY_HIDE_TOOLBAR; + private static final String PROP_NAME_HIDE_WINDOW_UI = DICT_KEY_HIDE_WINDOW_UI; + private static final String PROP_NAME_NO_FULL_PAGE = DICT_KEY_NO_FULL_PAGE; + private static final String PROP_NAME_PAGE_CLIP = DICT_KEY_PAGE_CLIP; + private static final String PROP_NAME_PRINT_AREA = DICT_KEY_PRINT_AREA; + private static final String PROP_NAME_VIEW_AREA = DICT_KEY_VIEW_AREA; + private static final String PROP_NAME_VIEW_CLIP = DICT_KEY_VIEW_CLIP; + private static final String PROP_NAME_DIRECTION = DICT_KEY_DIRECTION; + + private static final String DICT_KEY_FONT_DESCRIPTOR = "FontDescriptor"; + private static final String DICT_KEY_STARTXREF = "startxref"; + private static final String DICT_KEY_BASE_FONT = "BaseFont"; + private static final String DICT_KEY_CONTENTS = "Contents"; + private static final String DICT_KEY_CID_INFO = "CIDSystemInfo"; + private static final String DICT_KEY_DIFFERENCES = "Differences"; + private static final String DICT_KEY_RESOURCES = "Resources"; + private static final String DICT_KEY_TO_UNICODE = "ToUnicode"; + private static final String DICT_KEY_ROOT = "Root"; + private static final String DICT_KEY_RECT = "Rect"; + private static final String DICT_KEY_DEST = "Dest"; + private static 
final String DICT_KEY_FIRST_CHAR = "FirstChar"; + private static final String DICT_KEY_LAST_CHAR = "LastChar"; + private static final String DICT_KEY_TRAILER = "trailer"; + private static final String DICT_KEY_SIZE = "Size"; + private static final String DICT_KEY_ENCRYPT = "Encrypt"; + private static final String DICT_KEY_STMF = "StmF"; + private static final String DICT_KEY_INFO = "Info"; + private static final String DICT_KEY_ID = "ID"; + private static final String DICT_KEY_FONT_NAME = "FontName"; + private static final String DICT_KEY_FONT_FILE = "FontFile"; + private static final String DICT_KEY_FONT_FILE_2 = "FontFile2"; + private static final String DICT_KEY_FONT_FILE_3 = "FontFile3"; + private static final String DICT_KEY_BBOX = "BBox"; + private static final String DICT_KEY_FONT_BBOX = "FontBBox"; + private static final String DICT_KEY_XREF_STREAM = "XRefStm"; + private static final String DICT_KEY_VIEWER_PREFS = "ViewerPreferences"; + private static final String DICT_KEY_PAGE_LAYOUT = "PageLayout"; + private static final String DICT_KEY_PAGE_MODE = "PageMode"; + private static final String DICT_KEY_OUTLINES = "Outlines"; + private static final String DICT_KEY_ORDERING = "Ordering"; + private static final String DICT_KEY_REGISTRY = "Registry"; + private static final String DICT_KEY_SUPPLEMENT = "Supplement"; + private static final String DICT_KEY_LANG = "Lang"; + private static final String DICT_KEY_PAGES = "Pages"; + private static final String DICT_KEY_PAGE_LABELS = "PageLabels"; + private static final String DICT_KEY_TYPE = "Type"; + private static final String DICT_KEY_VERSION = "Version"; + private static final String DICT_KEY_EXTENSIONS = "Extensions"; + private static final String DICT_KEY_EXTENSIONLEVEL = "ExtensionLevel"; + private static final String DICT_KEY_BASEVERSION = "BaseVersion"; + private static final String PROP_NAME_BASEVERSION = DICT_KEY_BASEVERSION; + private static final String PROP_NAME_EXTENSIONLEVEL = DICT_KEY_EXTENSIONLEVEL; + 
private static final String PROP_NAME_DEVELOPERPREFIX = "DeveloperPrefix"; + private static final String DICT_KEY_NAME = "Name"; + private static final String DICT_KEY_NAMES = "Names"; + private static final String DICT_KEY_EMBEDDED_FILES = "EmbeddedFiles"; + private static final String DICT_KEY_DESTS = "Dests"; + private static final String DICT_KEY_FILTER = "Filter"; + private static final String DICT_KEY_K = "K"; + private static final String DICT_KEY_P = "P"; + private static final String DICT_KEY_R = "R"; + private static final String DICT_KEY_V = "V"; + private static final String DICT_KEY_ENCODING = "Encoding"; + private static final String DICT_KEY_BASE_ENCODING = "BaseEncoding"; + private static final String DICT_KEY_LENGTH = "Length"; + private static final String DICT_KEY_WIDTH = "Width"; + private static final String DICT_KEY_HEIGHT = "Height"; + private static final String DICT_KEY_KEY_LENGTH = "KeyLength"; + private static final String DICT_KEY_TITLE = "Title"; + private static final String DICT_KEY_AUTHOR = "Author"; + private static final String DICT_KEY_SUBJECT = "Subject"; + private static final String DICT_KEY_KEYWORDS = "Keywords"; + private static final String DICT_KEY_CREATOR = "Creator"; + private static final String DICT_KEY_PRODUCER = "Producer"; + private static final String DICT_KEY_CREATION_DATE = "CreationDate"; + private static final String DICT_KEY_MODIFIED_DATE = "ModDate"; + private static final String DICT_KEY_TRAPPED = "Trapped"; + private static final String DICT_KEY_XOBJ_SUBTYPE = "Subtype"; + private static final String DICT_KEY_FONT_SUBTYPE = DICT_KEY_XOBJ_SUBTYPE; + private static final String DICT_KEY_DECODE_PARAMS = "DecodeParms"; + private static final String DICT_KEY_COLOR_SPACE = "ColorSpace"; + private static final String DICT_KEY_METADATA = "Metadata"; + private static final String DICT_KEY_BITS_PER_COMPONENT = "BitsPerComponent"; + private static final String DICT_KEY_INTENT = "Intent"; + private static final String 
DICT_KEY_IMAGE_MASK = "ImageMask"; + private static final String DICT_KEY_DECODE = "Decode"; + private static final String DICT_KEY_INTERPOLATE = "Interpolate"; + private static final String DICT_KEY_DESCENDANT_FONTS = "DescendantFonts"; + private static final String DICT_KEY_ROTATE = "Rotate"; + private static final String DICT_KEY_USER_UNIT = "UserUnit"; + private static final String DICT_KEY_VIEWPORT = "VP"; + private static final String DICT_KEY_THUMB = "Thumb"; + private static final String DICT_KEY_MEASURE = "Measure"; + private static final String DICT_KEY_COUNT = "Count"; + private static final String DICT_KEY_PARENT = "Parent"; + private static final String DICT_KEY_PREV = "Prev"; + private static final String DICT_KEY_NEXT = "Next"; + private static final String DICT_KEY_FIRST = "First"; + private static final String DICT_KEY_LAST = "Last"; + private static final String DICT_KEY_FLAGS = "Flags"; + + private static final String KEY_VAL_CATALOG = "Catalog"; + private static final String KEY_VAL_PAGES = "Pages"; + + private static final String PROP_NAME_BASE_FONT = DICT_KEY_BASE_FONT; + private static final String PROP_NAME_CALLOUT_LINE = "CalloutLine"; + private static final String PROP_NAME_CMAP_DICT = "CMapDictionary"; + private static final String PROP_NAME_CID_INFO = DICT_KEY_CID_INFO; + private static final String PROP_NAME_CID_INFOS = "CIDSystemInfos"; + private static final String PROP_NAME_CONTENTS = DICT_KEY_CONTENTS; + private static final String PROP_NAME_DISTANCE = "Distance"; + private static final String PROP_NAME_DIFFERENCES = DICT_KEY_DIFFERENCES; + private static final String PROP_NAME_ENCODING = DICT_KEY_ENCODING; + private static final String PROP_NAME_ENCODING_DICTIONARY = "EncodingDictionary"; + private static final String PROP_NAME_BASE_ENCODING = DICT_KEY_BASE_ENCODING; + private static final String PROP_NAME_EXTERNAL_STREAMS = "ExternalStreams"; + private static final String PROP_NAME_FILTER = DICT_KEY_FILTER; + private static final 
String PROP_NAME_FILTERS = "Filters"; + private static final String PROP_NAME_FILE = "File"; + private static final String PROP_NAME_FIRST_CHAR = DICT_KEY_FIRST_CHAR; + private static final String PROP_NAME_FLAGS = DICT_KEY_FLAGS; + private static final String PROP_NAME_AREA = "Area"; + private static final String PROP_NAME_IMAGE = "Image"; + private static final String PROP_NAME_IMAGES = "Images"; + private static final String PROP_NAME_OBJECTS = "Objects"; + private static final String PROP_NAME_RESOURCES = DICT_KEY_RESOURCES; + private static final String PROP_NAME_SUBTYPE = DICT_KEY_XOBJ_SUBTYPE; + private static final String PROP_NAME_FREE_OBJECTS = "FreeObjects"; + private static final String PROP_NAME_INC_UPDATES = "IncrementalUpdates"; + private static final String PROP_NAME_DOC_CATALOG = "DocumentCatalog"; + private static final String PROP_NAME_ENCRYPTION = "Encryption"; + private static final String PROP_NAME_KEY_LENGTH = DICT_KEY_KEY_LENGTH; + private static final String PROP_NAME_INFO = DICT_KEY_INFO; + private static final String PROP_NAME_DESTINATION = "Destination"; + private static final String PROP_NAME_CHILDREN = "Children"; + private static final String PROP_NAME_PAGE_LAYOUT = DICT_KEY_PAGE_LAYOUT; + private static final String PROP_NAME_LANG = "Language"; + private static final String PROP_NAME_LAST_CHAR = DICT_KEY_LAST_CHAR; + private static final String PROP_NAME_MEASURE = DICT_KEY_MEASURE; + private static final String PROP_NAME_SECURITY_HANDLER = "SecurityHandler"; + private static final String PROP_NAME_EFF = "EFF"; + private static final String PROP_NAME_ALGORITHM = "Algorithm"; + private static final String PROP_NAME_RECT = DICT_KEY_RECT; + private static final String PROP_NAME_REVISION = "Revision"; + private static final String PROP_NAME_OWNER_STRING = "OwnerString"; + private static final String PROP_NAME_USER_STRING = "UserString"; + private static final String PROP_NAME_OWNERKEY_STRING = "OwnerEncryptionKey"; + private static final 
String PROP_NAME_USERKEY_STRING = "UserEncryptionKey"; + private static final String PROP_NAME_USER_UNIT = DICT_KEY_USER_UNIT; + private static final String PROP_NAME_STANDARD_SECURITY_HANDLER = "StandardSecurityHandler"; + private static final String PROP_NAME_TITLE = DICT_KEY_TITLE; + private static final String PROP_NAME_AUTHOR = DICT_KEY_AUTHOR; + private static final String PROP_NAME_SUBJECT = DICT_KEY_SUBJECT; + private static final String PROP_NAME_KEYWORDS = DICT_KEY_KEYWORDS; + private static final String PROP_NAME_CREATOR = DICT_KEY_CREATOR; + private static final String PROP_NAME_PRODUCER = DICT_KEY_PRODUCER; + private static final String PROP_NAME_CREATION_DATE = DICT_KEY_CREATION_DATE; + private static final String PROP_NAME_MODIFIED_DATE = DICT_KEY_MODIFIED_DATE; + private static final String PROP_NAME_TRAPPED = DICT_KEY_TRAPPED; + private static final String PROP_NAME_FILTER_PIPELINE = "FilterPipeline"; + private static final String PROP_NAME_NISO_IMAGE_MD = "NisoImageMetadata"; + private static final String PROP_NAME_COLOR_SPACE = DICT_KEY_COLOR_SPACE; + private static final String PROP_NAME_ACTION_DEST = "ActionDest"; + private static final String PROP_NAME_ANNOTATION = "Annotation"; + private static final String PROP_NAME_APP_DICT = "AppearanceDictionary"; + private static final String PROP_NAME_INTENT = DICT_KEY_INTENT; + private static final String PROP_NAME_IMAGE_MASK = DICT_KEY_IMAGE_MASK; + private static final String PROP_NAME_DECODE = DICT_KEY_DECODE; + private static final String PROP_NAME_NAME = DICT_KEY_NAME; + private static final String PROP_NAME_ID = DICT_KEY_ID; + private static final String PROP_NAME_ITEM = "Item"; + private static final String PROP_NAME_INTERPOLATE = DICT_KEY_INTERPOLATE; + private static final String PROP_NAME_FONT_TYPE0 = FONT_TYPE0; + private static final String PROP_NAME_FONT_TYPE1 = FONT_TYPE1; + private static final String PROP_NAME_FONT_TYPE3 = FONT_TYPE3; + private static final String 
PROP_NAME_FONT_MM_TYPE1 = FONT_MM_TYPE1; + private static final String PROP_NAME_FONT_TRUE_TYPE = FONT_TRUE_TYPE; + private static final String PROP_NAME_FONT_CID_TYPE0 = FONT_CID_TYPE0; + private static final String PROP_NAME_FONT_CID_TYPE2 = FONT_CID_TYPE2; + private static final String PROP_NAME_FONT = "Font"; + private static final String PROP_NAME_FONTS = "Fonts"; + private static final String PROP_NAME_FONT_SUBSET = "FontSubset"; + private static final String PROP_NAME_FONT_BBOX = DICT_KEY_FONT_BBOX; + private static final String PROP_NAME_FONT_DESC = DICT_KEY_FONT_DESCRIPTOR; + private static final String PROP_NAME_FONT_FILE = DICT_KEY_FONT_FILE; + private static final String PROP_NAME_FONT_FILE_2 = DICT_KEY_FONT_FILE_2; + private static final String PROP_NAME_FONT_FILE_3 = DICT_KEY_FONT_FILE_3; + private static final String PROP_NAME_FONT_NAME = DICT_KEY_FONT_NAME; + private static final String PROP_NAME_PDF_METADATA = "PDFMetadata"; + private static final String PROP_NAME_LAST_MOD = "LastModified"; + private static final String PROP_NAME_OUTLINES = DICT_KEY_OUTLINES; + private static final String PROP_NAME_REGISTRY = DICT_KEY_REGISTRY; + private static final String PROP_NAME_SUPPLEMENT = DICT_KEY_SUPPLEMENT; + private static final String PROP_NAME_PAGES = DICT_KEY_PAGES; + private static final String PROP_NAME_SEQUENCE = "Sequence"; + private static final String PROP_NAME_ANNOTATIONS = "Annotations"; + private static final String PROP_NAME_ROTATE = DICT_KEY_ROTATE; + private static final String PROP_NAME_REPLY_TYPE = "ReplyType"; + private static final String PROP_NAME_VIEWPORT = "Viewport"; + private static final String PROP_NAME_VIEWPORTS = "Viewports"; + private static final String PROP_NAME_THUMB = DICT_KEY_THUMB; + private static final String PROP_NAME_TO_UNICODE = DICT_KEY_TO_UNICODE; + private static final String PROP_NAME_PAGE = "Page"; + private static final String PROP_NAME_LABEL = "Label"; + private static final String PROP_NAME_RATIO = "Ratio"; 
+ + private static final String PROP_VAL_CROP_BOX = "CropBox"; + private static final String PROP_VAL_FONT_BBOX = DICT_KEY_FONT_BBOX; + private static final String PROP_VAL_NULL = "null"; + private static final String PROP_VAL_EXTERNAL = "External"; + private static final String PROP_VAL_NO_FLAGS_SET = "No flags set"; + private static final String XOBJ_SUBTYPE_IMAGE = PROP_NAME_IMAGE; + private static final String EMPTY_LABEL_PROPERTY = "[empty]"; + + /****************************************************************** + * PRIVATE CLASS FIELDS. + ******************************************************************/ + + private static final String NAME = "PDF-hul"; + private static final String RELEASE = "1.12.5"; + private static final int[] DATE = { 2024, 03, 05 }; + private static final String[] FORMAT = { "PDF", + "Portable Document Format" }; + private static final String COVERAGE = "PDF 1.0-1.6; " + + "PDF/X-1 (ISO 15930-1:2001), X-1a (ISO 15930-4:2003), " + + "X-2 (ISO 15930-5:2003), and X-3 (ISO 15930-6:2003); " + + "Tagged PDF; Linearized PDF"; + private static final String[] MIMETYPE = { MIME_TYPE }; + private static final String WELLFORMED = "A PDF file is " + + "well-formed if it meets the criteria defined in Chapter " + + "3 of the PDF Reference 1.6 (5th edition, 2004)"; + private static final String VALIDITY = null; + private static final String REPINFO = null; + private static final String NOTE = "This module does *not* validate data " + + "within content streams (including operators) or encrypted data"; + private static final String RIGHTS = "Copyright 2003-2007 by JSTOR and " + + "the President and Fellows of Harvard College. " + + "Released under the GNU Lesser General Public License."; + private static final String ENCRYPTED = ""; private static final String SPEC_DOC_TITLE = "PDF Reference: Adobe Portable Document Format, Version "; - /** Logger for this class. */ - protected Logger _logger; - - /** Font type selectors. 
*/ - public final static int F_TYPE0 = 1, F_TYPE1 = 2, F_TT = 3, F_TYPE3 = 4, - F_MM1 = 5, F_CID0 = 6, F_CID2 = 7; - - /****************************************************************** - * PRIVATE INSTANCE FIELDS. - ******************************************************************/ - - /** - * The maximum number of fonts that will be reported before we just - * give up and report a stub to avoid running out of memory. - */ - protected int DEFAULT_MAX_FONTS = 1000; - - /* Constants for trailer parsing */ - private static final int EOFSCANSIZE = 1024; - private static final int XREFSCANSIZE = 128; // generous... - - protected RandomAccessFile _raf; - protected Parser _parser; - protected String _version; - protected Property _metadata; - protected Property _xmpProp; - protected long _eof; - protected long _startxref; - protected long _prevxref; - protected int _numFreeObjects; - protected Property _idProperty; - protected int _objCount; // Count of objects in the cross-reference - // table - protected int _numObjects; // Value of the "Size" entry in the trailer - // dictionary - protected int _numTrailers; // Count of the number of trailers (updates) - protected Map _objects; // Map of the objects in the file - protected long[] _xref; // Array of object offsets from XRef table - protected int[][] _xref2; // Array of int[2], giving object stream and - // offset when _xref[i] < 0 - protected boolean _xrefIsStream; // True if XRef streams rather than tables - // are used - protected boolean _encrypted; // Equivalent to _encryptDictRef != null - protected boolean _streamsEncrypted; // streams are encrypted and can't be parsed. 
- protected List _docCatalogList; // Info extracted from doc cat dict - protected List _encryptList; // Info from encryption dict - protected List _docInfoList; // Info from doc info dict - protected List _extStreamsList; // List of external streams - protected List _imagesList; // List of image streams - protected List _filtersList; // List of filters - protected List _pagesList; // List of PageObjects - - /** Map of Type 0 font dictionaries. */ - protected Map _type0FontsMap; - /** Map of Type 1 font dictionaries. */ - protected Map _type1FontsMap; - /** Map of Multiple Master font dictionaries. */ - protected Map _mmFontsMap; - /** Map of Type 3 font dictionaries. */ - protected Map _type3FontsMap; - /** Map of TrueType font dictionaries. */ - protected Map _trueTypeFontsMap; - /** Map of CIDFont/Type 1 dictionaries. */ - protected Map _cid0FontsMap; - /** Map of CIDFont/TrueType dictionaries. */ - protected Map _cid2FontsMap; - - /** Map associating page object dictionaries with sequence numbers. 
*/ - protected Map _pageSeqMap; - - protected PdfIndirectObj _docCatDictRef; - protected PdfIndirectObj _encryptDictRef; - protected PdfIndirectObj _docInfoDictRef; - protected PdfIndirectObj _pagesDictRef; - - protected PdfDictionary _docCatDict; - protected PdfDictionary _docInfoDict; - protected PageTreeNode _docTreeRoot; - protected PdfDictionary _pageLabelDict; - protected PageLabelNode _pageLabelRoot; - protected NameTreeNode _embeddedFiles; - protected NameTreeNode _destNames; - protected PdfDictionary _encryptDict; - protected PdfDictionary _trailerDict; - protected PdfDictionary _viewPrefDict; - protected PdfDictionary _outlineDict; - protected PdfDictionary _destsDict; - - protected boolean _showFonts; - protected boolean _showOutlines; - protected boolean _showAnnotations; - protected boolean _showPages; - - protected boolean _actionsExist; - protected boolean _pdfACompliant; // flag checking PDF/A compliance - - /** True if warning has been issued on recursive outlines. */ - protected boolean _recursionWarned; - - /* - * These three variables track whether certain messages have been posted - * notifying the user of omitted information. - */ - protected boolean _skippedFontsReported; - protected boolean _skippedOutlinesReported; - protected boolean _skippedAnnotationsReported; - protected boolean _skippedPagesReported; - - /** List of profile checkers. */ - protected List _profile; - - /** Cached object stream. */ - protected ObjectStream _cachedObjectStream; - - /** Object number of cached object stream. */ - protected int _cachedStreamIndex; - - /** Map of visited nodes when walking through an outline. */ - protected Set _visitedOutlineNodes; - - /** Maximum number of fonts to report full information on. */ - protected int maxFonts; - - /** Number of fonts reported so far. 
*/ - protected int _nFonts; - - /* Name-to-value array pairs for NISO metadata */ - private final static String[] compressionStrings = { FILTER_NAME_LZW, - /* "FlateDecode", */ FILTER_NAME_RUN_LENGTH, FILTER_NAME_DCT, - FILTER_NAME_CCITT }; - private final static int[] compressionValues = { 5, /* 8, */ 32773, 6, 2 }; - /* - * The value of 2 (CCITTFaxDecode) is a placeholder; additional - * checking of the K parameter is needed to determine the real - * value if that's returned. - */ - - private final static String[] colorSpaceStrings = { "Lab", "DeviceRGB", - "DeviceCMYK", "DeviceGray", "Indexed" }; - private final static int[] colorSpaceValues = { 8, 2, 5, 1, 3 }; - - /****************************************************************** - * CLASS CONSTRUCTOR. - ******************************************************************/ - - /** - * Creates an instance of the module and initializes identifying - * information. - */ - public PdfModule() { - - super(NAME, RELEASE, DATE, FORMAT, COVERAGE, MIMETYPE, WELLFORMED, - VALIDITY, REPINFO, NOTE, RIGHTS, true); - - _logger = Logger.getLogger("edu.harvard.hul.ois.jhove.module"); - - _vendor = Agent.harvardInstance(); + /** Logger for this class. */ + protected Logger _logger; + + /** Font type selectors. */ + public final static int F_TYPE0 = 1, F_TYPE1 = 2, F_TT = 3, F_TYPE3 = 4, + F_MM1 = 5, F_CID0 = 6, F_CID2 = 7; + + /****************************************************************** + * PRIVATE INSTANCE FIELDS. + ******************************************************************/ + + /** + * The maximum number of fonts that will be reported before we just + * give up and report a stub to avoid running out of memory. + */ + protected int DEFAULT_MAX_FONTS = 1000; + + /* Constants for trailer parsing */ + private static final int EOFSCANSIZE = 1024; + private static final int XREFSCANSIZE = 128; // generous... 
+ + protected RandomAccessFile _raf; + protected Parser _parser; + protected String _version; + protected Property _metadata; + protected Property _xmpProp; + protected long _eof; + protected long _startxref; + protected long _prevxref; + protected int _numFreeObjects; + protected Property _idProperty; + protected int _objCount; // Count of objects in the cross-reference + // table + protected int _numObjects; // Value of the "Size" entry in the trailer + // dictionary + protected int _numTrailers; // Count of the number of trailers (updates) + protected Map _objects; // Map of the objects in the file + protected long[] _xref; // Array of object offsets from XRef table + protected int[][] _xref2; // Array of int[2], giving object stream and + // offset when _xref[i] < 0 + protected boolean _xrefIsStream; // True if XRef streams rather than tables + // are used + protected boolean _encrypted; // Equivalent to _encryptDictRef != null + protected boolean _streamsEncrypted; // streams are encrypted and can't be parsed. + protected List _docCatalogList; // Info extracted from doc cat dict + protected List _encryptList; // Info from encryption dict + protected List _docInfoList; // Info from doc info dict + protected List _extStreamsList; // List of external streams + protected List _imagesList; // List of image streams + protected List _filtersList; // List of filters + protected List _pagesList; // List of PageObjects + + /** Map of Type 0 font dictionaries. */ + protected Map _type0FontsMap; + /** Map of Type 1 font dictionaries. */ + protected Map _type1FontsMap; + /** Map of Multiple Master font dictionaries. */ + protected Map _mmFontsMap; + /** Map of Type 3 font dictionaries. */ + protected Map _type3FontsMap; + /** Map of TrueType font dictionaries. */ + protected Map _trueTypeFontsMap; + /** Map of CIDFont/Type 1 dictionaries. */ + protected Map _cid0FontsMap; + /** Map of CIDFont/TrueType dictionaries. 
*/ + protected Map _cid2FontsMap; + + /** Map associating page object dictionaries with sequence numbers. */ + protected Map _pageSeqMap; + + protected PdfIndirectObj _docCatDictRef; + protected PdfIndirectObj _encryptDictRef; + protected PdfIndirectObj _docInfoDictRef; + protected PdfIndirectObj _pagesDictRef; + + protected PdfDictionary _docCatDict; + protected PdfDictionary _docInfoDict; + protected PageTreeNode _docTreeRoot; + protected PdfDictionary _pageLabelDict; + protected PageLabelNode _pageLabelRoot; + protected NameTreeNode _embeddedFiles; + protected NameTreeNode _destNames; + protected PdfDictionary _encryptDict; + protected PdfDictionary _trailerDict; + protected PdfDictionary _viewPrefDict; + protected PdfDictionary _outlineDict; + protected PdfDictionary _destsDict; + + protected boolean _showFonts; + protected boolean _showOutlines; + protected boolean _showAnnotations; + protected boolean _showPages; + + protected boolean _actionsExist; + protected boolean _pdfACompliant; // flag checking PDF/A compliance + + /** True if warning has been issued on recursive outlines. */ + protected boolean _recursionWarned; + + /* + * These three variables track whether certain messages have been posted + * notifying the user of omitted information. + */ + protected boolean _skippedFontsReported; + protected boolean _skippedOutlinesReported; + protected boolean _skippedAnnotationsReported; + protected boolean _skippedPagesReported; + + /** List of profile checkers. */ + protected List _profile; + + /** Cached object stream. */ + protected ObjectStream _cachedObjectStream; + + /** Object number of cached object stream. */ + protected int _cachedStreamIndex; + + /** Map of visited nodes when walking through an outline. */ + protected Set _visitedOutlineNodes; + + /** Maximum number of fonts to report full information on. */ + protected int maxFonts; + + /** Number of fonts reported so far. 
*/ + protected int _nFonts; + + /* Name-to-value array pairs for NISO metadata */ + private final static String[] compressionStrings = { FILTER_NAME_LZW, + /* "FlateDecode", */ FILTER_NAME_RUN_LENGTH, FILTER_NAME_DCT, + FILTER_NAME_CCITT }; + private final static int[] compressionValues = { 5, /* 8, */ 32773, 6, 2 }; + /* + * The value of 2 (CCITTFaxDecode) is a placeholder; additional + * checking of the K parameter is needed to determine the real + * value if that's returned. + */ + + private final static String[] colorSpaceStrings = { "Lab", "DeviceRGB", + "DeviceCMYK", "DeviceGray", "Indexed" }; + private final static int[] colorSpaceValues = { 8, 2, 5, 1, 3 }; + + /****************************************************************** + * CLASS CONSTRUCTOR. + ******************************************************************/ + + /** + * Creates an instance of the module and initializes identifying + * information. + */ + public PdfModule() { + + super(NAME, RELEASE, DATE, FORMAT, COVERAGE, MIMETYPE, WELLFORMED, + VALIDITY, REPINFO, NOTE, RIGHTS, true); + + _logger = Logger.getLogger("edu.harvard.hul.ois.jhove.module"); + + _vendor = Agent.harvardInstance(); Document doc = new Document(SPEC_DOC_TITLE + "1.4", DocumentType.BOOK); - Agent agent = Agent.newAdobeInstance(); - doc.setPublisher(agent); - doc.setDate("2001-12"); - doc.setEdition("3rd edition"); - doc.setIdentifier(new Identifier("0-201-75839-3", IdentifierType.ISBN)); - doc.setIdentifier(new Identifier( - "http://partners.adobe.com/asn/" + "acrobat/docs/File_Format_" - + "Specifications/PDFReference.pdf", - IdentifierType.URL)); - _specification.add(doc); + Agent agent = Agent.newAdobeInstance(); + doc.setPublisher(agent); + doc.setDate("2001-12"); + doc.setEdition("3rd edition"); + doc.setIdentifier(new Identifier("0-201-75839-3", IdentifierType.ISBN)); + doc.setIdentifier(new Identifier( + "http://partners.adobe.com/asn/" + "acrobat/docs/File_Format_" + + "Specifications/PDFReference.pdf", + 
IdentifierType.URL)); + _specification.add(doc); doc = new Document(SPEC_DOC_TITLE + "1.5", DocumentType.BOOK); - doc.setPublisher(agent); - doc.setDate("2003"); - doc.setEdition("4th edition"); - doc.setIdentifier(new Identifier( - "http://partners.adobe.com/public/developer/en/pdf/PDFReference15_v6.pdf", - IdentifierType.URL)); - _specification.add(doc); + doc.setPublisher(agent); + doc.setDate("2003"); + doc.setEdition("4th edition"); + doc.setIdentifier(new Identifier( + "http://partners.adobe.com/public/developer/en/pdf/PDFReference15_v6.pdf", + IdentifierType.URL)); + _specification.add(doc); doc = new Document(SPEC_DOC_TITLE + "1.6", DocumentType.BOOK); - doc.setPublisher(agent); - doc.setDate("2004-11"); - doc.setEdition("5th edition"); - doc.setIdentifier(new Identifier( - "http://partners.adobe.com/public/developer/en/pdf/PDFReference16.pdf", - IdentifierType.URL)); - _specification.add(doc); - - doc = new Document("Graphic technology -- Prepress " - + "digital data exchange -- Use of PDF -- " - + "Part 1: Complete exchange using CMYK data " - + "(PDF/X-1 and PDF/X-1a)", DocumentType.STANDARD); - Agent isoAgent = Agent.newIsoInstance(); - doc.setPublisher(isoAgent); - doc.setDate("2001-12-06"); - doc.setIdentifier( - new Identifier("ISO 15930-1:2001", IdentifierType.ISO)); - _specification.add(doc); - - doc = new Document("Graphic technology -- Prepress " - + "digital data exchange -- Use of PDF -- " - + "Part 4: Complete exchange using CMYK and " - + "spot colour printing data using " + "PDF 1.4 (PDF/X-1a)", - DocumentType.STANDARD); - doc.setPublisher(isoAgent); - doc.setDate("2003-08-04"); - doc.setIdentifier( - new Identifier("ISO 15930-4:2003", IdentifierType.ISO)); - _specification.add(doc); - - doc = new Document("Graphic technology -- Prepress " - + "digital data exchange -- Use of PDF -- " - + "Part 5: Partial exchange of printing data " - + "using PDF 1.4 (PDF/X-2)", DocumentType.STANDARD); - doc.setPublisher(isoAgent); - 
doc.setDate("2003-08-05"); - doc.setIdentifier( - new Identifier("ISO 15930-5:2003", IdentifierType.ISO)); - _specification.add(doc); - - doc = new Document("Graphic technology -- Prepress " - + "digital data exchange -- Use of PDF -- " - + "Part 6: Complete exchange suitable for " - + "colour-managed workflows using " + "PDF 1.4 (PDF/X-3)", - DocumentType.STANDARD); - doc.setPublisher(isoAgent); - doc.setDate("2003-08-06"); - doc.setIdentifier( - new Identifier("ISO 15930-6:2003", IdentifierType.ISO)); - _specification.add(doc); - - _signature.add(new ExternalSignature(EXT, SignatureType.EXTENSION, - SignatureUseType.OPTIONAL)); - _signature.add(new InternalSignature(PdfHeader.PDF_SIG_HEADER, - SignatureType.MAGIC, SignatureUseType.MANDATORY, 0)); - - doc = new Document( - "Document management -- Electronic " - + "document file format for long-term " - + "preservation -- Part 1: Use of PDF (PDF/A)", - DocumentType.RFC); - doc.setPublisher(isoAgent); - doc.setDate("2003-11-30"); - doc.setIdentifier(new Identifier("ISO/CD 19005-1", IdentifierType.ISO)); - doc.setIdentifier(new Identifier( - "http://www.aiim.org/documents/standards/ISO_19005-1_(E).doc", - IdentifierType.URL)); - _specification.add(doc); - - _profile = new ArrayList(6); - _profile.add(new LinearizedProfile(this)); - TaggedProfile tpr = new TaggedProfile(this); - _profile.add(tpr); - - /* - * CURRENT PDF/A PROFILING UNFIT FOR PURPOSE; SEE GITHUB ISSUE #101. - * - * AProfile apr = new AProfile(this); - * _profile.add(apr); - * // Link AProfile to TaggedProfile to save checking - * // the former twice. - * apr.setTaggedProfile(tpr); - * - * AProfileLevelA apra = new AProfileLevelA(this); - * _profile.add(apra); - * // AProfileLevelA depends on AProfile - * apra.setAProfile(apr); - */ - - X1Profile x1 = new X1Profile(this); - _profile.add(x1); - X1aProfile x1a = new X1aProfile(this); - _profile.add(x1a); - // Linking the X1 profile to the X1a profile saves checking the former - // twice. 
- x1a.setX1Profile(x1); - _profile.add(new X2Profile(this)); - _profile.add(new X3Profile(this)); - - _showAnnotations = false; - _showFonts = false; - _showOutlines = false; - _showPages = false; - maxFonts = DEFAULT_MAX_FONTS; - } - - /****************************************************************** - * PUBLIC INSTANCE METHODS. - * - * Parsing methods. - ******************************************************************/ - - /** - * Reset parameter settings. - * Returns to a default state without any parameters. - */ - @Override - public void resetParams() { - _showAnnotations = true; - _showFonts = true; - _showOutlines = true; - _showPages = true; - maxFonts = DEFAULT_MAX_FONTS; - } - - /** - * Per-action initialization. May be called multiple times. - * - * @param param - * The module parameter; under command-line Jhove, the -p - * parameter. - * If the parameter contains the indicated characters, then the - * specified information is omitted; otherwise, it is included. - * (This is the reverse of the behavior prior to beta 3.) - * These characters may be provided as separate parameters, - * or all in a single parameter. - *
    - *
  • a: annotations
  • - *
  • f: fonts
  • - *
  • o: outlines
  • - *
  • p: pages
  • - *
- *
- * The parameter is case-independent. A null parameter is - * equivalent to the empty string. - */ - @Override - public void param(String param) { - if (param != null) { - param = param.toLowerCase(); - if (param.indexOf('a') >= 0) { - _showAnnotations = false; - } - if (param.indexOf('f') >= 0) { - _showFonts = false; - } - if (param.indexOf('o') >= 0) { - _showOutlines = false; - } - if (param.indexOf('p') >= 0) { - _showPages = false; - } - if (param.indexOf('n') >= 0) { - // Parse out the number after the n, and use that to set - // the maximum number of fonts reported. Default is - // DEFAULT_MAX_FONTS. - int n = param.indexOf('n'); - StringBuffer b = new StringBuffer(); - for (int i = n + 1; i < param.length(); i++) { - char ch = param.charAt(i); - if (Character.isDigit(ch)) { - b.append(ch); - } else { - break; - } - } - try { - int mx = Integer.parseInt(b.toString()); - if (mx > 0) { - maxFonts = mx; - } - } catch (Exception e) { - } - } - } - } - - /** - * Parses a file and stores descriptive information. A RandomAccessFile - * must be used to represent the object. 
- * - * @param raf - * A PDF file - * @param info - * A clean RepInfo object, which will be modified to hold - * the descriptive information - */ - @Override - public final void parse(RandomAccessFile raf, RepInfo info) - throws IOException { - initParse(); - initInfo(info); - _objects = new HashMap<>(); - _raf = raf; - - Tokenizer tok = new FileTokenizer(_raf); - _parser = new Parser(tok); - _parser.setObjectMap(_objects); - - List metadataList = new ArrayList(11); - /* - * We construct a big whopping property, - * which contains up to 11 subproperties - */ - _metadata = new Property(PROP_NAME_PDF_METADATA, PropertyType.PROPERTY, - PropertyArity.LIST, metadataList); - - if (_raf.length() > 10000000000L) { // that's 10^10 - _pdfACompliant = false; // doesn't meet size limit in Appendix C - // of PDF spec - } - if (!parseHeader(info)) { - return; - } - if (!findLastTrailer(info)) { - return; - } - - /* - * Walk through the linked trailer and cross reference - * sections. - */ - _prevxref = -1; - boolean lastTrailer = true; - while (_startxref > 0) { - // After the first (last) trailer, parse only for next "Prev" link - if (!parseTrailer(info, !lastTrailer)) { - return; - } - if (!readXRefInfo(info)) { - return; - } - ++_numTrailers; - if (_xrefIsStream) { - /* - * If we have an xref stream, readXRefInfo dealt with all - * the streams in a single call. 
- */ - break; - } - // Beware infinite loop on badly broken file - if (_startxref == _prevxref) { - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_134, // PDF-HUL-134 - _parser.getOffset())); - info.setWellFormed(false); - return; - } - _startxref = _prevxref; - lastTrailer = false; - } - if (!readDocCatalogDict(info)) { - return; - } - if (!readEncryptDict(info)) { - return; - } - if (!readDocInfoDict(info)) { - return; - } - if (!readDocumentTree(info)) { - return; - } - if (!readPageLabelTree(info)) { - return; - } - if (!readXMPData(info)) { - return; - } - findExternalStreams(info); - if (!findFilters(info) && !_streamsEncrypted) { - return; - } - findImages(info); - findFonts(info); - - /* Object is well-formed PDF. */ - - // Calculate checksums if not already present - checksumIfRafNotCopied(info, raf); - - info.setVersion(_version); - metadataList.add(new Property(PROP_NAME_OBJECTS, PropertyType.INTEGER, - new Integer(_numObjects))); - metadataList.add(new Property(PROP_NAME_FREE_OBJECTS, - PropertyType.INTEGER, new Integer(_numFreeObjects))); - metadataList.add(new Property(PROP_NAME_INC_UPDATES, - PropertyType.INTEGER, new Integer(_numTrailers))); - if (_docCatalogList != null) { - metadataList.add( - new Property(PROP_NAME_DOC_CATALOG, PropertyType.PROPERTY, - PropertyArity.LIST, _docCatalogList)); - } - if (_encryptList != null) { - metadataList.add(new Property(PROP_NAME_ENCRYPTION, - PropertyType.PROPERTY, PropertyArity.LIST, _encryptList)); - } - if (_docInfoList != null) { - metadataList.add(new Property(PROP_NAME_INFO, PropertyType.PROPERTY, - PropertyArity.LIST, _docInfoList)); - } - if (_idProperty != null) { - metadataList.add(_idProperty); - } - if (_extStreamsList != null && !_extStreamsList.isEmpty()) { - metadataList.add(new Property(PROP_NAME_EXTERNAL_STREAMS, - PropertyType.PROPERTY, PropertyArity.LIST, - _extStreamsList)); - } - if (_filtersList != null && !_filtersList.isEmpty()) { - metadataList.add(new 
Property(PROP_NAME_FILTERS, - PropertyType.PROPERTY, PropertyArity.LIST, _filtersList)); - } - if (_imagesList != null && !_imagesList.isEmpty()) { - metadataList.add(new Property(PROP_NAME_IMAGES, - PropertyType.PROPERTY, PropertyArity.LIST, _imagesList)); - } - if (_showFonts || _verbosity == Module.MAXIMUM_VERBOSITY) { - try { - addFontsProperty(metadataList); - } catch (NullPointerException e) { - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_135, - e.toString())); // PDF-HUL-135 - } - } - if (_nFonts > maxFonts) { - info.setMessage(new InfoMessage(MessageConstants.PDF_HUL_136, // PDF-HUL-136 - MessageConstants.PDF_HUL_136_SUB.getMessage() + _nFonts)); - } - if (_xmpProp != null) { - metadataList.add(_xmpProp); - } - addPagesProperty(metadataList, info); - - if (!doOutlineStuff(info)) { - return; - } - - info.setProperty(_metadata); - - /* Check for profile conformance. */ - - if (!_parser.getPDFACompliant()) { - _pdfACompliant = false; - } - if (info.getWellFormed() == RepInfo.TRUE) { - // Well-formedness is necessary to satisfy any profile. - ListIterator pter = _profile.listIterator(); - while (pter.hasNext()) { - PdfProfile prof = pter.next(); - if (prof.satisfiesProfile(_raf, _parser)) { - info.setProfile(prof.getText()); - } - } - } - } - - /** - * Returns true if the module hasn't detected any violations - * of PDF/A compliance. This must return true, but is not - * sufficient by itself, to establish compliance. The - * AProfile profiler makes the final determination. - */ - public boolean mayBePDFACompliant() { - return _pdfACompliant; - } - - /** - * Returns the document tree root. - */ - public PageTreeNode getDocumentTree() { - return _docTreeRoot; - } - - /** - * Returns the document information dictionary. - */ - public PdfDictionary getDocInfo() { - return _docInfoDict; - } - - /** - * Returns the encryption dictionary. 
- */ - public PdfDictionary getEncryptionDict() { - return _encryptDict; - } - - /** - * Return true if Actions have been detected in the file. - */ - public boolean getActionsExist() { - return _actionsExist; - } - - /** - * Initialize the module. This is called at the start - * of parse restore the module to its initial state. - */ - @Override - protected final void initParse() { - super.initParse(); - _xref = null; - _xref2 = null; - _version = ""; - _objects = null; - _numFreeObjects = 0; - _objCount = 0; - _docInfoList = null; - _extStreamsList = null; - _docCatalogList = null; - _encryptList = null; - _imagesList = null; - _filtersList = null; - _pagesList = null; - _type0FontsMap = null; - _type1FontsMap = null; - _mmFontsMap = null; - _type3FontsMap = null; - _trueTypeFontsMap = null; - _cid0FontsMap = null; - _cid2FontsMap = null; - _docCatDictRef = null; + doc.setPublisher(agent); + doc.setDate("2004-11"); + doc.setEdition("5th edition"); + doc.setIdentifier(new Identifier( + "http://partners.adobe.com/public/developer/en/pdf/PDFReference16.pdf", + IdentifierType.URL)); + _specification.add(doc); + + doc = new Document("Graphic technology -- Prepress " + + "digital data exchange -- Use of PDF -- " + + "Part 1: Complete exchange using CMYK data " + + "(PDF/X-1 and PDF/X-1a)", DocumentType.STANDARD); + Agent isoAgent = Agent.newIsoInstance(); + doc.setPublisher(isoAgent); + doc.setDate("2001-12-06"); + doc.setIdentifier( + new Identifier("ISO 15930-1:2001", IdentifierType.ISO)); + _specification.add(doc); + + doc = new Document("Graphic technology -- Prepress " + + "digital data exchange -- Use of PDF -- " + + "Part 4: Complete exchange using CMYK and " + + "spot colour printing data using " + "PDF 1.4 (PDF/X-1a)", + DocumentType.STANDARD); + doc.setPublisher(isoAgent); + doc.setDate("2003-08-04"); + doc.setIdentifier( + new Identifier("ISO 15930-4:2003", IdentifierType.ISO)); + _specification.add(doc); + + doc = new Document("Graphic technology -- Prepress 
" + + "digital data exchange -- Use of PDF -- " + + "Part 5: Partial exchange of printing data " + + "using PDF 1.4 (PDF/X-2)", DocumentType.STANDARD); + doc.setPublisher(isoAgent); + doc.setDate("2003-08-05"); + doc.setIdentifier( + new Identifier("ISO 15930-5:2003", IdentifierType.ISO)); + _specification.add(doc); + + doc = new Document("Graphic technology -- Prepress " + + "digital data exchange -- Use of PDF -- " + + "Part 6: Complete exchange suitable for " + + "colour-managed workflows using " + "PDF 1.4 (PDF/X-3)", + DocumentType.STANDARD); + doc.setPublisher(isoAgent); + doc.setDate("2003-08-06"); + doc.setIdentifier( + new Identifier("ISO 15930-6:2003", IdentifierType.ISO)); + _specification.add(doc); + + _signature.add(new ExternalSignature(EXT, SignatureType.EXTENSION, + SignatureUseType.OPTIONAL)); + _signature.add(new InternalSignature(PdfHeader.PDF_SIG_HEADER, + SignatureType.MAGIC, SignatureUseType.MANDATORY, 0)); + + doc = new Document( + "Document management -- Electronic " + + "document file format for long-term " + + "preservation -- Part 1: Use of PDF (PDF/A)", + DocumentType.RFC); + doc.setPublisher(isoAgent); + doc.setDate("2003-11-30"); + doc.setIdentifier(new Identifier("ISO/CD 19005-1", IdentifierType.ISO)); + doc.setIdentifier(new Identifier( + "http://www.aiim.org/documents/standards/ISO_19005-1_(E).doc", + IdentifierType.URL)); + _specification.add(doc); + + _profile = new ArrayList(6); + _profile.add(new LinearizedProfile(this)); + TaggedProfile tpr = new TaggedProfile(this); + _profile.add(tpr); + + /* + * CURRENT PDF/A PROFILING UNFIT FOR PURPOSE; SEE GITHUB ISSUE #101. + * + * AProfile apr = new AProfile(this); + * _profile.add(apr); + * // Link AProfile to TaggedProfile to save checking + * // the former twice. 
+ * apr.setTaggedProfile(tpr); + * + * AProfileLevelA apra = new AProfileLevelA(this); + * _profile.add(apra); + * // AProfileLevelA depends on AProfile + * apra.setAProfile(apr); + */ + + X1Profile x1 = new X1Profile(this); + _profile.add(x1); + X1aProfile x1a = new X1aProfile(this); + _profile.add(x1a); + // Linking the X1 profile to the X1a profile saves checking the former + // twice. + x1a.setX1Profile(x1); + _profile.add(new X2Profile(this)); + _profile.add(new X3Profile(this)); + + _showAnnotations = false; + _showFonts = false; + _showOutlines = false; + _showPages = false; + maxFonts = DEFAULT_MAX_FONTS; + } + + /****************************************************************** + * PUBLIC INSTANCE METHODS. + * + * Parsing methods. + ******************************************************************/ + + /** + * Reset parameter settings. + * Returns to a default state without any parameters. + */ + @Override + public void resetParams() { + _showAnnotations = true; + _showFonts = true; + _showOutlines = true; + _showPages = true; + maxFonts = DEFAULT_MAX_FONTS; + } + + /** + * Per-action initialization. May be called multiple times. + * + * @param param + * The module parameter; under command-line Jhove, the -p + * parameter. + * If the parameter contains the indicated characters, then the + * specified information is omitted; otherwise, it is included. + * (This is the reverse of the behavior prior to beta 3.) + * These characters may be provided as separate parameters, + * or all in a single parameter. + *
    + *
  • a: annotations
  • + *
  • f: fonts
  • + *
  • o: outlines
  • + *
  • p: pages
  • + *
+ *
+ * The parameter is case-independent. A null parameter is + * equivalent to the empty string. + */ + @Override + public void param(String param) { + if (param != null) { + param = param.toLowerCase(); + if (param.indexOf('a') >= 0) { + _showAnnotations = false; + } + if (param.indexOf('f') >= 0) { + _showFonts = false; + } + if (param.indexOf('o') >= 0) { + _showOutlines = false; + } + if (param.indexOf('p') >= 0) { + _showPages = false; + } + if (param.indexOf('n') >= 0) { + // Parse out the number after the n, and use that to set + // the maximum number of fonts reported. Default is + // DEFAULT_MAX_FONTS. + int n = param.indexOf('n'); + StringBuffer b = new StringBuffer(); + for (int i = n + 1; i < param.length(); i++) { + char ch = param.charAt(i); + if (Character.isDigit(ch)) { + b.append(ch); + } else { + break; + } + } + try { + int mx = Integer.parseInt(b.toString()); + if (mx > 0) { + maxFonts = mx; + } + } catch (Exception e) { + } + } + } + } + + /** + * Parses a file and stores descriptive information. A RandomAccessFile + * must be used to represent the object. 
+ * + * @param raf + * A PDF file + * @param info + * A clean RepInfo object, which will be modified to hold + * the descriptive information + */ + @Override + public final void parse(RandomAccessFile raf, RepInfo info) + throws IOException { + initParse(); + initInfo(info); + _objects = new HashMap<>(); + _raf = raf; + + Tokenizer tok = new FileTokenizer(_raf); + _parser = new Parser(tok); + _parser.setObjectMap(_objects); + + List metadataList = new ArrayList(11); + /* + * We construct a big whopping property, + * which contains up to 11 subproperties + */ + _metadata = new Property(PROP_NAME_PDF_METADATA, PropertyType.PROPERTY, + PropertyArity.LIST, metadataList); + + if (_raf.length() > 10000000000L) { // that's 10^10 + _pdfACompliant = false; // doesn't meet size limit in Appendix C + // of PDF spec + } + if (!parseHeader(info)) { + return; + } + if (!findLastTrailer(info)) { + return; + } + + /* + * Walk through the linked trailer and cross reference + * sections. + */ + _prevxref = -1; + boolean lastTrailer = true; + while (_startxref > 0) { + // After the first (last) trailer, parse only for next "Prev" link + if (!parseTrailer(info, !lastTrailer)) { + return; + } + if (!readXRefInfo(info)) { + return; + } + ++_numTrailers; + if (_xrefIsStream) { + /* + * If we have an xref stream, readXRefInfo dealt with all + * the streams in a single call. 
+ */ + break; + } + // Beware infinite loop on badly broken file + if (_startxref == _prevxref) { + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_134, // PDF-HUL-134 + _parser.getOffset())); + info.setWellFormed(false); + return; + } + _startxref = _prevxref; + lastTrailer = false; + } + if (!readDocCatalogDict(info)) { + return; + } + if (!readEncryptDict(info)) { + return; + } + if (!readDocInfoDict(info)) { + return; + } + if (!readDocumentTree(info)) { + return; + } + if (!readPageLabelTree(info)) { + return; + } + if (!readXMPData(info)) { + return; + } + findExternalStreams(info); + if (!findFilters(info) && !_streamsEncrypted) { + return; + } + findImages(info); + findFonts(info); + + /* Object is well-formed PDF. */ + + // Calculate checksums if not already present + checksumIfRafNotCopied(info, raf); + + info.setVersion(_version); + metadataList.add(new Property(PROP_NAME_OBJECTS, PropertyType.INTEGER, + new Integer(_numObjects))); + metadataList.add(new Property(PROP_NAME_FREE_OBJECTS, + PropertyType.INTEGER, new Integer(_numFreeObjects))); + metadataList.add(new Property(PROP_NAME_INC_UPDATES, + PropertyType.INTEGER, new Integer(_numTrailers))); + if (_docCatalogList != null) { + metadataList.add( + new Property(PROP_NAME_DOC_CATALOG, PropertyType.PROPERTY, + PropertyArity.LIST, _docCatalogList)); + } + if (_encryptList != null) { + metadataList.add(new Property(PROP_NAME_ENCRYPTION, + PropertyType.PROPERTY, PropertyArity.LIST, _encryptList)); + } + if (_docInfoList != null) { + metadataList.add(new Property(PROP_NAME_INFO, PropertyType.PROPERTY, + PropertyArity.LIST, _docInfoList)); + } + if (_idProperty != null) { + metadataList.add(_idProperty); + } + if (_extStreamsList != null && !_extStreamsList.isEmpty()) { + metadataList.add(new Property(PROP_NAME_EXTERNAL_STREAMS, + PropertyType.PROPERTY, PropertyArity.LIST, + _extStreamsList)); + } + if (_filtersList != null && !_filtersList.isEmpty()) { + metadataList.add(new 
Property(PROP_NAME_FILTERS, + PropertyType.PROPERTY, PropertyArity.LIST, _filtersList)); + } + if (_imagesList != null && !_imagesList.isEmpty()) { + metadataList.add(new Property(PROP_NAME_IMAGES, + PropertyType.PROPERTY, PropertyArity.LIST, _imagesList)); + } + if (_showFonts || _verbosity == Module.MAXIMUM_VERBOSITY) { + try { + addFontsProperty(metadataList); + } catch (NullPointerException e) { + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_135, + e.toString())); // PDF-HUL-135 + } + } + if (_nFonts > maxFonts) { + info.setMessage(new InfoMessage(MessageConstants.PDF_HUL_136, // PDF-HUL-136 + MessageConstants.PDF_HUL_136_SUB.getMessage() + _nFonts)); + } + if (_xmpProp != null) { + metadataList.add(_xmpProp); + } + addPagesProperty(metadataList, info); + + if (!doOutlineStuff(info)) { + return; + } + + info.setProperty(_metadata); + + /* Check for profile conformance. */ + + if (!_parser.getPDFACompliant()) { + _pdfACompliant = false; + } + if (info.getWellFormed() == RepInfo.TRUE) { + // Well-formedness is necessary to satisfy any profile. + ListIterator pter = _profile.listIterator(); + while (pter.hasNext()) { + PdfProfile prof = pter.next(); + if (prof.satisfiesProfile(_raf, _parser)) { + info.setProfile(prof.getText()); + } + } + } + } + + /** + * Returns true if the module hasn't detected any violations + * of PDF/A compliance. This must return true, but is not + * sufficient by itself, to establish compliance. The + * AProfile profiler makes the final determination. + */ + public boolean mayBePDFACompliant() { + return _pdfACompliant; + } + + /** + * Returns the document tree root. + */ + public PageTreeNode getDocumentTree() { + return _docTreeRoot; + } + + /** + * Returns the document information dictionary. + */ + public PdfDictionary getDocInfo() { + return _docInfoDict; + } + + /** + * Returns the encryption dictionary. 
+ */ + public PdfDictionary getEncryptionDict() { + return _encryptDict; + } + + /** + * Return true if Actions have been detected in the file. + */ + public boolean getActionsExist() { + return _actionsExist; + } + + /** + * Initialize the module. This is called at the start + * of parse restore the module to its initial state. + */ + @Override + protected final void initParse() { + super.initParse(); + _xref = null; + _xref2 = null; + _version = ""; + _objects = null; + _numFreeObjects = 0; + _objCount = 0; + _docInfoList = null; + _extStreamsList = null; + _docCatalogList = null; + _encryptList = null; + _imagesList = null; + _filtersList = null; + _pagesList = null; + _type0FontsMap = null; + _type1FontsMap = null; + _mmFontsMap = null; + _type3FontsMap = null; + _trueTypeFontsMap = null; + _cid0FontsMap = null; + _cid2FontsMap = null; + _docCatDictRef = null; _encryptDictRef = null; - _docInfoDictRef = null; - _pagesDictRef = null; - _docCatDict = null; - _docInfoDict = null; - _docTreeRoot = null; - _pageLabelDict = null; - _encryptDict = null; - _trailerDict = null; - _viewPrefDict = null; - _outlineDict = null; - _destsDict = null; - _pageSeqMap = null; - _pageLabelRoot = null; - _embeddedFiles = null; - _destNames = null; - _skippedFontsReported = false; - _skippedOutlinesReported = false; - _skippedAnnotationsReported = false; - _skippedPagesReported = false; - _idProperty = null; - _actionsExist = false; - _numObjects = 0; - _numTrailers = -1; - _pdfACompliant = true; // assume compliance till disproven - _xmpProp = null; - _cachedStreamIndex = -1; - _nFonts = 0; - } - - protected boolean parseHeader(RepInfo info) throws IOException { - PdfHeader header = null; - try { - header = PdfHeader.parseHeader(_parser); - } catch (PdfMalformedException e) { - info.setWellFormed(false); - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_155, 0L)); // PDF-HUL-155 - return false; - } - if (header == null) { - info.setWellFormed(false); - info.setMessage(new 
ErrorMessage(MessageConstants.PDF_HUL_137, 0L)); // PDF-HUL-137 - return false; - } - if (!header.isVersionValid()) { - info.setValid(false); - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_148, 0L)); // PDF-HUL-148 - } - _version = header.getVersionString(); - _pdfACompliant = header.isPdfACompliant(); - info.setSigMatch(_name); - return true; - } - - private long lastEOFOffset(RandomAccessFile raf) throws IOException { - - long offset = 0; - long flen = 0; - byte[] buf = null; - - // overkill to restore fileposition, but make this - // as side-effect free as possible - long savepos = raf.getFilePointer(); - flen = raf.length(); - buf = new byte[(int) Math.min(EOFSCANSIZE, flen)]; - offset = flen - buf.length; - raf.seek(offset); - raf.read(buf); - raf.seek(savepos); - - // OK: - // flen is the total length of the file - // offset is 1024 bytes from the end of file or 0 if file is shorter - // than 1024 - // buf contains all bytes from offset to end of file - - long eofpos = -1; - // Note the limits, selected so the index never is out of bounds - for (int i = buf.length - 4; i >= 1; i--) { - if (buf[i] == '%') { - if ((buf[i - 1] == '%') && (buf[i + 1] == 'E') - && (buf[i + 2] == 'O') && (buf[i + 3] == 'F')) { - eofpos = offset + i - 1; - break; - } - } - } - - // if (Tracing.T_MODULE) System.out.println(flen - eofpos); - return eofpos; - - } - - private long lastStartXrefOffset(RandomAccessFile raf, long eofOffset) - throws IOException { - - long offset = 0; - long flen = 0; - byte[] buf = null; - - // overkill to restore fileposition, but make this - // as side-effect free as possible - long savepos = raf.getFilePointer(); - flen = raf.length(); - if (eofOffset <= 0) { - eofOffset = flen; - } - if (eofOffset >= flen) { - eofOffset = flen; - } - buf = new byte[(int) Math.min(XREFSCANSIZE, eofOffset)]; - offset = eofOffset - buf.length; - raf.seek(offset); - raf.read(buf); - raf.seek(savepos); - - // OK: - // flen is the total length of the file - // 
offset is 128 bytes from the end of file or 0 if file is shorter than - // 128 - // buf contains all bytes from offset to end of file - - long xrefpos = -1; - // Note the limits, selected so the index never is out of bounds - for (int i = buf.length - 9; i >= 0; i--) { - if (buf[i] == 's') { - if ((buf[i + 1] == 't') && (buf[i + 2] == 'a') - && (buf[i + 3] == 'r') && (buf[i + 4] == 't') - && (buf[i + 5] == 'x') && (buf[i + 6] == 'r') - && (buf[i + 7] == 'e') && (buf[i + 8] == 'f')) { - xrefpos = offset + i; - break; - } - } - } - - // if (Tracing.T_MODULE) System.out.println(flen - xrefpos); - return xrefpos; - - } - - /** Locate the last trailer of the file */ - protected boolean findLastTrailer(RepInfo info) throws IOException { - /* - * Parse file trailer. Technically, this should be the last thing in - * the file, but we follow the Acrobat convention of looking in the - * last 1024 bytes. Since incremental updates may add multiple - * EOF comments, make sure that we use the last one in the file. - */ - - Token token = null; - String value = null; - - _eof = lastEOFOffset(_raf); - - if (_eof < 0L) { - info.setWellFormed(false); - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_138, - _raf.length())); // PDF-HUL-138 - return false; - } - - // For PDF-A compliance, this must be at the very end. - /* - * Fix contributed by FCLA, 2007-05-30, to test for trailing data - * properly. - * - * if (_raf.length () - _eof > 6) { - */ - if (_raf.length() - _eof > 7) { - _pdfACompliant = false; - } - - /* Retrieve the "startxref" keyword. */ - - long startxrefoffset = lastStartXrefOffset(_raf, _eof); - _startxref = -1L; - - if (startxrefoffset >= 0) { - try { - _parser.seek(startxrefoffset); // points to the 'startxref' kw - // _parser.seek(_eof - 23); // should we allow more slop? 
- } catch (PdfException e) { - } - while (true) { - try { - token = _parser.getNext(); - } catch (Exception e) { - // we're starting at an arbitrary point, so there - // can be parsing errors. Ignore them till we get - // back in sync. - continue; - } - if (token == null) { - break; - } - if (token instanceof Keyword) { - value = ((Keyword) token).getValue(); - if (DICT_KEY_STARTXREF.equals(value)) { - try { - token = _parser.getNext(); - } catch (Exception e) { - break; // no excuses here - } - if (token != null && token instanceof Numeric) { - _startxref = ((Numeric) token).getLongValue(); - } - } - } - } - } - if (_startxref < 0L) { - info.setWellFormed(false); - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_139, // PDF-HUL-139 - _parser.getOffset())); - return false; - } - return true; - } - - /* - * Parse a "trailer" (which is not necessarily the last - * thing in the file, as trailers can be linked.) - */ - protected boolean parseTrailer(RepInfo info, boolean prevOnly) - throws IOException { - Token token = null; - String value = null; - /* Parse the trailer dictionary. */ - - try { - _parser.seek(_startxref); - /* - * The next object may be either the keyword "xref", signifying - * a classic cross-reference table, or a stream object, - * signifying the new-style cross-reference stream. - */ - Token xref = _parser.getNext(); - if (xref instanceof Keyword) { - _xrefIsStream = false; - _parser.getNext(Numeric.class, // PDF-HUL-68 - MessageConstants.PDF_HUL_68); // first obj number - - _objCount = ((Numeric) _parser.getNext(Numeric.class, // PDF-HUL-69 - MessageConstants.PDF_HUL_69)).getIntegerValue(); - _parser.seek(_parser.getOffset() + _objCount * 20); - } else if (xref instanceof Numeric) { - /* No cross-ref tables to backtrack. */ - _xrefIsStream = true; - _prevxref = -1; - /* - * But I do need to read the dictionary at this point, to get - * essential stuff out of it. 
- */ - PdfObject pdfStreamObj = _parser.readObjectDef((Numeric) xref); - // the retrieved object should be stream - if (!(pdfStreamObj instanceof PdfStream)) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_150, - _parser.getOffset()); - } - PdfDictionary dict = ((PdfStream) pdfStreamObj).getDict(); - _docCatDictRef = (PdfIndirectObj) dict.get(DICT_KEY_ROOT); - if (_docCatDictRef == null) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_70, // PDF-HUL-70 - _parser.getOffset()); - } - // readEncryptDict is not enough to check encryption when exists. - _encryptDictRef = (PdfIndirectObj) dict.get(DICT_KEY_ENCRYPT); - if (_encryptDictRef != null) { - _encrypted = true; - } - /* - * We don't need to see a trailer dictionary. - * Move along, move along. - */ - return true; - } - - /* Now find the "trailer" keyword. */ - long trailer = -1L; - while ((token = _parser.getNext()) != null) { - if (token instanceof Keyword) { - value = ((Keyword) token).getValue(); - if (DICT_KEY_TRAILER.equals(value)) { - token = _parser.getNext(); - if (token instanceof DictionaryStart) { - trailer = _parser.getOffset() - 7L; - break; - } - } - } - } - if (trailer < 0L) { - info.setWellFormed(false); - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_71, // PDF-HUL-71 - _parser.getOffset())); - return false; - } - - _trailerDict = _parser.readDictionary(); - PdfObject obj; - - // Extract contents of the trailer dictionary - - _prevxref = -1; - obj = _trailerDict.get(DICT_KEY_PREV); - if (obj != null) { - if (obj instanceof PdfSimpleObject) { - token = ((PdfSimpleObject) obj).getToken(); - if (token instanceof Numeric) - _prevxref = ((Numeric) token).getLongValue(); - } - if (_prevxref < 0) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_72, // PDF-HUL-72 - _parser.getOffset()); - } - } - // If this isn't the last (first read) trailer, then we - // ignore all the other dictionary entries. 
- if (prevOnly) { - return true; - } - - obj = _trailerDict.get(DICT_KEY_SIZE); - _docCatDictRef = (PdfIndirectObj) _trailerDict.get(DICT_KEY_ROOT); - if (obj != null) { - _numObjects = -1; - if (obj instanceof PdfSimpleObject) { - token = ((PdfSimpleObject) obj).getToken(); - if (token instanceof Numeric) { - _numObjects = ((Numeric) token).getIntegerValue(); - _xref = new long[_numObjects]; - } else { - throw new PdfInvalidException(MessageConstants.PDF_HUL_73, // PDF-HUL-73 - _parser.getOffset()); - } - } - if (_numObjects < 0) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_73, // PDF-HUL-73 - _parser.getOffset()); - } - if (_numObjects > 8388607) { - // Appendix C implementation limit is enforced by PDF/A - _pdfACompliant = false; - } - } else - throw new PdfInvalidException(MessageConstants.PDF_HUL_74, // PDF-HUL-74 - _parser.getOffset()); - - if (_docCatDictRef == null) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_75, // PDF-HUL-75 - _parser.getOffset()); - } - PdfObject encryptObj = _trailerDict.get(DICT_KEY_ENCRYPT); - if (encryptObj instanceof PdfIndirectObj) { + _docInfoDictRef = null; + _pagesDictRef = null; + _docCatDict = null; + _docInfoDict = null; + _docTreeRoot = null; + _pageLabelDict = null; + _encryptDict = null; + _trailerDict = null; + _viewPrefDict = null; + _outlineDict = null; + _destsDict = null; + _pageSeqMap = null; + _pageLabelRoot = null; + _embeddedFiles = null; + _destNames = null; + _skippedFontsReported = false; + _skippedOutlinesReported = false; + _skippedAnnotationsReported = false; + _skippedPagesReported = false; + _idProperty = null; + _actionsExist = false; + _numObjects = 0; + _numTrailers = -1; + _pdfACompliant = true; // assume compliance till disproven + _xmpProp = null; + _cachedStreamIndex = -1; + _nFonts = 0; + } + + protected boolean parseHeader(RepInfo info) throws IOException { + PdfHeader header = null; + try { + header = PdfHeader.parseHeader(_parser); + } catch (PdfMalformedException e) 
{ + info.setWellFormed(false); + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_155, 0L)); // PDF-HUL-155 + return false; + } + if (header == null) { + info.setWellFormed(false); + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_137, 0L)); // PDF-HUL-137 + return false; + } + if (!header.isVersionValid()) { + info.setValid(false); + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_148, 0L)); // PDF-HUL-148 + } + _version = header.getVersionString(); + _pdfACompliant = header.isPdfACompliant(); + info.setSigMatch(_name); + return true; + } + + private long lastEOFOffset(RandomAccessFile raf) throws IOException { + + long offset = 0; + long flen = 0; + byte[] buf = null; + + // overkill to restore fileposition, but make this + // as side-effect free as possible + long savepos = raf.getFilePointer(); + flen = raf.length(); + buf = new byte[(int) Math.min(EOFSCANSIZE, flen)]; + offset = flen - buf.length; + raf.seek(offset); + raf.read(buf); + raf.seek(savepos); + + // OK: + // flen is the total length of the file + // offset is 1024 bytes from the end of file or 0 if file is shorter + // than 1024 + // buf contains all bytes from offset to end of file + + long eofpos = -1; + // Note the limits, selected so the index never is out of bounds + for (int i = buf.length - 4; i >= 1; i--) { + if (buf[i] == '%') { + if ((buf[i - 1] == '%') && (buf[i + 1] == 'E') + && (buf[i + 2] == 'O') && (buf[i + 3] == 'F')) { + eofpos = offset + i - 1; + break; + } + } + } + + // if (Tracing.T_MODULE) System.out.println(flen - eofpos); + return eofpos; + + } + + private long lastStartXrefOffset(RandomAccessFile raf, long eofOffset) + throws IOException { + + long offset = 0; + long flen = 0; + byte[] buf = null; + + // overkill to restore fileposition, but make this + // as side-effect free as possible + long savepos = raf.getFilePointer(); + flen = raf.length(); + if (eofOffset <= 0) { + eofOffset = flen; + } + if (eofOffset >= flen) { + eofOffset = flen; + } 
+ buf = new byte[(int) Math.min(XREFSCANSIZE, eofOffset)]; + offset = eofOffset - buf.length; + raf.seek(offset); + raf.read(buf); + raf.seek(savepos); + + // OK: + // flen is the total length of the file + // offset is 128 bytes from the end of file or 0 if file is shorter than + // 128 + // buf contains all bytes from offset to end of file + + long xrefpos = -1; + // Note the limits, selected so the index never is out of bounds + for (int i = buf.length - 9; i >= 0; i--) { + if (buf[i] == 's') { + if ((buf[i + 1] == 't') && (buf[i + 2] == 'a') + && (buf[i + 3] == 'r') && (buf[i + 4] == 't') + && (buf[i + 5] == 'x') && (buf[i + 6] == 'r') + && (buf[i + 7] == 'e') && (buf[i + 8] == 'f')) { + xrefpos = offset + i; + break; + } + } + } + + // if (Tracing.T_MODULE) System.out.println(flen - xrefpos); + return xrefpos; + + } + + /** Locate the last trailer of the file */ + protected boolean findLastTrailer(RepInfo info) throws IOException { + /* + * Parse file trailer. Technically, this should be the last thing in + * the file, but we follow the Acrobat convention of looking in the + * last 1024 bytes. Since incremental updates may add multiple + * EOF comments, make sure that we use the last one in the file. + */ + + Token token = null; + String value = null; + + _eof = lastEOFOffset(_raf); + + if (_eof < 0L) { + info.setWellFormed(false); + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_138, + _raf.length())); // PDF-HUL-138 + return false; + } + + // For PDF-A compliance, this must be at the very end. + /* + * Fix contributed by FCLA, 2007-05-30, to test for trailing data + * properly. + * + * if (_raf.length () - _eof > 6) { + */ + if (_raf.length() - _eof > 7) { + _pdfACompliant = false; + } + + /* Retrieve the "startxref" keyword. 
*/ + + long startxrefoffset = lastStartXrefOffset(_raf, _eof); + _startxref = -1L; + + if (startxrefoffset >= 0) { + try { + _parser.seek(startxrefoffset); // points to the 'startxref' kw + // _parser.seek(_eof - 23); // should we allow more slop? + } catch (PdfException e) { + } + while (true) { + try { + token = _parser.getNext(); + } catch (Exception e) { + // we're starting at an arbitrary point, so there + // can be parsing errors. Ignore them till we get + // back in sync. + continue; + } + if (token == null) { + break; + } + if (token instanceof Keyword) { + value = ((Keyword) token).getValue(); + if (DICT_KEY_STARTXREF.equals(value)) { + try { + token = _parser.getNext(); + } catch (Exception e) { + break; // no excuses here + } + if (token != null && token instanceof Numeric) { + _startxref = ((Numeric) token).getLongValue(); + } + } + } + } + } + if (_startxref < 0L) { + info.setWellFormed(false); + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_139, // PDF-HUL-139 + _parser.getOffset())); + return false; + } + return true; + } + + /* + * Parse a "trailer" (which is not necessarily the last + * thing in the file, as trailers can be linked.) + */ + protected boolean parseTrailer(RepInfo info, boolean prevOnly) + throws IOException { + Token token = null; + String value = null; + /* Parse the trailer dictionary. */ + + try { + _parser.seek(_startxref); + /* + * The next object may be either the keyword "xref", signifying + * a classic cross-reference table, or a stream object, + * signifying the new-style cross-reference stream. 
+ */ + Token xref = _parser.getNext(); + if (xref instanceof Keyword) { + _xrefIsStream = false; + _parser.getNext(Numeric.class, // PDF-HUL-68 + MessageConstants.PDF_HUL_68); // first obj number + + _objCount = ((Numeric) _parser.getNext(Numeric.class, // PDF-HUL-69 + MessageConstants.PDF_HUL_69)).getIntegerValue(); + _parser.seek(_parser.getOffset() + _objCount * 20); + } else if (xref instanceof Numeric) { + /* No cross-ref tables to backtrack. */ + _xrefIsStream = true; + _prevxref = -1; + /* + * But I do need to read the dictionary at this point, to get + * essential stuff out of it. + */ + PdfObject pdfStreamObj = _parser.readObjectDef((Numeric) xref); + // the retrieved object should be stream + if (!(pdfStreamObj instanceof PdfStream)) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_150, + _parser.getOffset()); + } + PdfDictionary dict = ((PdfStream) pdfStreamObj).getDict(); + _docCatDictRef = (PdfIndirectObj) dict.get(DICT_KEY_ROOT); + if (_docCatDictRef == null) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_70, // PDF-HUL-70 + _parser.getOffset()); + } + // readEncryptDict is not enough to check encryption when exists. + _encryptDictRef = (PdfIndirectObj) dict.get(DICT_KEY_ENCRYPT); + if (_encryptDictRef != null) { + _encrypted = true; + } + /* + * We don't need to see a trailer dictionary. + * Move along, move along. + */ + return true; + } + + /* Now find the "trailer" keyword. 
*/ + long trailer = -1L; + while ((token = _parser.getNext()) != null) { + if (token instanceof Keyword) { + value = ((Keyword) token).getValue(); + if (DICT_KEY_TRAILER.equals(value)) { + token = _parser.getNext(); + if (token instanceof DictionaryStart) { + trailer = _parser.getOffset() - 7L; + break; + } + } + } + } + if (trailer < 0L) { + info.setWellFormed(false); + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_71, // PDF-HUL-71 + _parser.getOffset())); + return false; + } + + _trailerDict = _parser.readDictionary(); + PdfObject obj; + + // Extract contents of the trailer dictionary + + _prevxref = -1; + obj = _trailerDict.get(DICT_KEY_PREV); + if (obj != null) { + if (obj instanceof PdfSimpleObject) { + token = ((PdfSimpleObject) obj).getToken(); + if (token instanceof Numeric) + _prevxref = ((Numeric) token).getLongValue(); + } + if (_prevxref < 0) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_72, // PDF-HUL-72 + _parser.getOffset()); + } + } + // If this isn't the last (first read) trailer, then we + // ignore all the other dictionary entries. 
+ if (prevOnly) { + return true; + } + + obj = _trailerDict.get(DICT_KEY_SIZE); + _docCatDictRef = (PdfIndirectObj) _trailerDict.get(DICT_KEY_ROOT); + if (obj != null) { + _numObjects = -1; + if (obj instanceof PdfSimpleObject) { + token = ((PdfSimpleObject) obj).getToken(); + if (token instanceof Numeric) { + _numObjects = ((Numeric) token).getIntegerValue(); + _xref = new long[_numObjects]; + } else { + throw new PdfInvalidException(MessageConstants.PDF_HUL_73, // PDF-HUL-73 + _parser.getOffset()); + } + } + if (_numObjects < 0) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_73, // PDF-HUL-73 + _parser.getOffset()); + } + if (_numObjects > 8388607) { + // Appendix C implementation limit is enforced by PDF/A + _pdfACompliant = false; + } + } else + throw new PdfInvalidException(MessageConstants.PDF_HUL_74, // PDF-HUL-74 + _parser.getOffset()); + + if (_docCatDictRef == null) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_75, // PDF-HUL-75 + _parser.getOffset()); + } + PdfObject encryptObj = _trailerDict.get(DICT_KEY_ENCRYPT); + if (encryptObj instanceof PdfIndirectObj) { _encryptDictRef = (PdfIndirectObj) _trailerDict .get(DICT_KEY_ENCRYPT); - } else if (encryptObj instanceof PdfDictionary) { + } else if (encryptObj instanceof PdfDictionary) { _encryptDict = (PdfDictionary) _trailerDict .get(DICT_KEY_ENCRYPT); - } + } _encrypted = (_encryptDictRef != null) || (_encryptDict != null); - PdfObject infoObj = _trailerDict.get(DICT_KEY_INFO); - if (infoObj != null && !(infoObj instanceof PdfIndirectObj)) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_76, // PDF-HUL-76 - _parser.getOffset()); - } - _docInfoDictRef = (PdfIndirectObj) infoObj; - - obj = _trailerDict.get(DICT_KEY_ID); // This is at least v. 
1.1 - if (obj != null) { - if (obj instanceof PdfArray) { - String[] id = new String[2]; - try { - PdfArray idArray = (PdfArray) obj; - Vector idVec = idArray.getContent(); - if (idVec.size() != 2) { - throw new PdfInvalidException( - MessageConstants.PDF_HUL_77); // PDF-HUL-77 - } - PdfSimpleObject idobj = (PdfSimpleObject) idVec.get(0); - id[0] = toHex(((StringValuedToken) idobj.getToken()) - .getRawBytes()); - idobj = (PdfSimpleObject) idVec.get(1); - id[1] = toHex(((StringValuedToken) idobj.getToken()) - .getRawBytes()); - _idProperty = new Property(DICT_KEY_ID, - PropertyType.STRING, PropertyArity.ARRAY, id); - } catch (Exception e) { - throw new PdfInvalidException( - MessageConstants.PDF_HUL_78); // PDF-HUL-78 - } - } else { - throw new PdfInvalidException(MessageConstants.PDF_HUL_79, - _parser.getOffset()); // PDF-HUL-79 - } - } - obj = _trailerDict.get(DICT_KEY_XREF_STREAM); - if (obj != null) { - /* - * We have a "hybrid" cross-reference scheme. This means we have - * to go through the cross-reference stream and have its entries - * supplement the cross-reference section. - */ - _logger.warning("Hybrid cross-reference not yet implemented"); - } - } catch (PdfException e) { - - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - // If it's merely invalid rather than ill-formed, keep going - return (e instanceof PdfInvalidException); - } - return true; - } - - /* Parses the cross-reference table or stream. */ - protected boolean readXRefInfo(RepInfo info) throws IOException { - if (_xrefIsStream) { - return readXRefStreams(info); - } - return readXRefTables(info); - } - - /* - * Parses the cross-reference streams. This is called from - * readXRefInfo if there is no cross-reference table. - * I still need to deal with hybrid cases. All linked cross-reference - * streams are handled here. 
- */ - protected boolean readXRefStreams(RepInfo info) throws IOException { - _pdfACompliant = false; // current version of PDF/A doesn't recognize - // XREF streams - while (_startxref > 0) { - try { - _parser.seek(_startxref); - PdfObject pdfStreamObj = _parser.readObjectDef(); - // the retrieved object should be stream - if (!(pdfStreamObj instanceof PdfStream)) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_150, - _parser.getOffset()); - } - PdfStream pstream = (PdfStream) pdfStreamObj; - int sObjNum = pstream.getObjNumber(); - CrossRefStream xstream = new CrossRefStream(pstream); - if (!xstream.isValid()) { - return false; - } - xstream.initRead(_raf); - int xrefSize = xstream.getCrossRefTableSize(); - if (_xref == null) { - _xref = new long[xrefSize]; - _xref2 = new int[xrefSize][]; - } - if (sObjNum < 0 || sObjNum >= xrefSize) { - throw new PdfMalformedException(MessageConstants.PDF_HUL_80, // PDF-HUL-80 - _parser.getOffset()); - } - _xref[sObjNum] = _startxref; // insert the index of the xref - // stream itself - _startxref = xstream.getPrevXref(); - try { - while (xstream.readNextObject()) { - int objNum = xstream.getObjNum(); - if (xstream.isObjCompressed()) { - // Hold off on this branch - _xref[objNum] = -1; // defers to _xref2 - _xref2[objNum] = new int[] { - xstream.getContentStreamObjNum(), - xstream.getContentStreamIndex() }; - } else { - if (_xref[objNum] == 0) { - _xref[objNum] = xstream.getOffset(); - } - } - } - _numFreeObjects += xstream.getFreeCount(); - } catch (IOException e) { - info.setWellFormed(false); - info.setMessage( - new ErrorMessage(MessageConstants.PDF_HUL_81, // PDF-HUL-81 - _parser.getOffset())); - return false; - } - } catch (PdfException e) { - - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - // If it's merely invalid rather than ill-formed, keep going - return (e instanceof PdfInvalidException); - } - } - return true; // incomplete, but let it through - } - - 
/* - * Parses the cross-reference table. This is called from - * readXRefInfo if there is a cross-reference table. - */ - protected boolean readXRefTables(RepInfo info) throws IOException { - Token token = null; - try { - _parser.seek(_startxref); - token = _parser.getNext(); // "xref" keyword or numeric - if (token instanceof Keyword) { - while ((token = _parser.getNext()) != null) { - int firstObj = 0; - // Look for the start of a cross-ref subsection, which - // begins with a base object number and a count. - if (token instanceof Numeric) { - firstObj = ((Numeric) token).getIntegerValue(); - } else { - // On anything else, assume we're done with this - // section. - // (Most likely we've hit the keyword "trailer". - break; - } - token = _parser.getNext(); - if (token instanceof Numeric) { - _objCount = ((Numeric) token).getIntegerValue(); - } - if (_xref == null) { - _xref = new long[_objCount]; - } - for (int i = 0; i < _objCount; i++) { - // In reading the cross-reference table, also check - // the extra syntactic requirements of PDF/A. - long offset = ((Numeric) _parser.getNext(Numeric.class, - MessageConstants.PDF_HUL_82)).getLongValue(); // PDF-HUL-82 - _parser.getNext(); // Generation number - if (_parser.getWSString().length() > 1) { - _pdfACompliant = false; - } - token = _parser.getNext(Keyword.class, - MessageConstants.PDF_HUL_83); // PDF-HUL-83 - if (_parser.getWSString().length() > 1) { - _pdfACompliant = false; - } - // A keyword of "n" signifies an object in use, - // "f" signifies a free object. If we already - // have an entry for this object, don't replace it. 
- String keyval = ((Keyword) token).getValue(); - if ("n".equals(keyval)) { - if (_xref[firstObj + i] == 0) { - _xref[firstObj + i] = offset; - } - } else if ("f".equals(keyval)) { - _numFreeObjects++; - } else { - throw new PdfMalformedException( - MessageConstants.PDF_HUL_84, // PDF-HUL-84 - _parser.getOffset()); - } - } - } - } - } catch (PdfException e) { - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - return false; - } catch (Exception e) { - info.setValid(false); + PdfObject infoObj = _trailerDict.get(DICT_KEY_INFO); + if (infoObj != null && !(infoObj instanceof PdfIndirectObj)) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_76, // PDF-HUL-76 + _parser.getOffset()); + } + _docInfoDictRef = (PdfIndirectObj) infoObj; + + obj = _trailerDict.get(DICT_KEY_ID); // This is at least v. 1.1 + if (obj != null) { + if (obj instanceof PdfArray) { + String[] id = new String[2]; + try { + PdfArray idArray = (PdfArray) obj; + Vector idVec = idArray.getContent(); + if (idVec.size() != 2) { + throw new PdfInvalidException( + MessageConstants.PDF_HUL_77); // PDF-HUL-77 + } + PdfSimpleObject idobj = (PdfSimpleObject) idVec.get(0); + id[0] = toHex(((StringValuedToken) idobj.getToken()) + .getRawBytes()); + idobj = (PdfSimpleObject) idVec.get(1); + id[1] = toHex(((StringValuedToken) idobj.getToken()) + .getRawBytes()); + _idProperty = new Property(DICT_KEY_ID, + PropertyType.STRING, PropertyArity.ARRAY, id); + } catch (Exception e) { + throw new PdfInvalidException( + MessageConstants.PDF_HUL_78); // PDF-HUL-78 + } + } else { + throw new PdfInvalidException(MessageConstants.PDF_HUL_79, + _parser.getOffset()); // PDF-HUL-79 + } + } + obj = _trailerDict.get(DICT_KEY_XREF_STREAM); + if (obj != null) { + /* + * We have a "hybrid" cross-reference scheme. This means we have + * to go through the cross-reference stream and have its entries + * supplement the cross-reference section. 
+ */ + _logger.warning("Hybrid cross-reference not yet implemented"); + } + } catch (PdfException e) { + + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + // If it's merely invalid rather than ill-formed, keep going + return (e instanceof PdfInvalidException); + } + return true; + } + + /* Parses the cross-reference table or stream. */ + protected boolean readXRefInfo(RepInfo info) throws IOException { + if (_xrefIsStream) { + return readXRefStreams(info); + } + return readXRefTables(info); + } + + /* + * Parses the cross-reference streams. This is called from + * readXRefInfo if there is no cross-reference table. + * I still need to deal with hybrid cases. All linked cross-reference + * streams are handled here. + */ + protected boolean readXRefStreams(RepInfo info) throws IOException { + _pdfACompliant = false; // current version of PDF/A doesn't recognize + // XREF streams + while (_startxref > 0) { + try { + _parser.seek(_startxref); + PdfObject pdfStreamObj = _parser.readObjectDef(); + // the retrieved object should be stream + if (!(pdfStreamObj instanceof PdfStream)) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_150, + _parser.getOffset()); + } + PdfStream pstream = (PdfStream) pdfStreamObj; + int sObjNum = pstream.getObjNumber(); + CrossRefStream xstream = new CrossRefStream(pstream); + if (!xstream.isValid()) { + return false; + } + xstream.initRead(_raf); + int xrefSize = xstream.getCrossRefTableSize(); + if (_xref == null) { + _xref = new long[xrefSize]; + _xref2 = new int[xrefSize][]; + } + if (sObjNum < 0 || sObjNum >= xrefSize) { + throw new PdfMalformedException(MessageConstants.PDF_HUL_80, // PDF-HUL-80 + _parser.getOffset()); + } + _xref[sObjNum] = _startxref; // insert the index of the xref + // stream itself + _startxref = xstream.getPrevXref(); + try { + while (xstream.readNextObject()) { + int objNum = xstream.getObjNum(); + if (xstream.isObjCompressed()) { + // Hold off on this 
branch + _xref[objNum] = -1; // defers to _xref2 + _xref2[objNum] = new int[] { + xstream.getContentStreamObjNum(), + xstream.getContentStreamIndex() }; + } else { + if (_xref[objNum] == 0) { + _xref[objNum] = xstream.getOffset(); + } + } + } + _numFreeObjects += xstream.getFreeCount(); + } catch (IOException e) { + info.setWellFormed(false); + info.setMessage( + new ErrorMessage(MessageConstants.PDF_HUL_81, // PDF-HUL-81 + _parser.getOffset())); + return false; + } + } catch (PdfException e) { + + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + // If it's merely invalid rather than ill-formed, keep going + return (e instanceof PdfInvalidException); + } + } + return true; // incomplete, but let it through + } + + /* + * Parses the cross-reference table. This is called from + * readXRefInfo if there is a cross-reference table. + */ + protected boolean readXRefTables(RepInfo info) throws IOException { + Token token = null; + try { + _parser.seek(_startxref); + token = _parser.getNext(); // "xref" keyword or numeric + if (token instanceof Keyword) { + while ((token = _parser.getNext()) != null) { + int firstObj = 0; + // Look for the start of a cross-ref subsection, which + // begins with a base object number and a count. + if (token instanceof Numeric) { + firstObj = ((Numeric) token).getIntegerValue(); + } else { + // On anything else, assume we're done with this + // section. + // (Most likely we've hit the keyword "trailer". + break; + } + token = _parser.getNext(); + if (token instanceof Numeric) { + _objCount = ((Numeric) token).getIntegerValue(); + } + if (_xref == null) { + _xref = new long[_objCount]; + } + for (int i = 0; i < _objCount; i++) { + // In reading the cross-reference table, also check + // the extra syntactic requirements of PDF/A. 
+ long offset = ((Numeric) _parser.getNext(Numeric.class, + MessageConstants.PDF_HUL_82)).getLongValue(); // PDF-HUL-82 + _parser.getNext(); // Generation number + if (_parser.getWSString().length() > 1) { + _pdfACompliant = false; + } + token = _parser.getNext(Keyword.class, + MessageConstants.PDF_HUL_83); // PDF-HUL-83 + if (_parser.getWSString().length() > 1) { + _pdfACompliant = false; + } + // A keyword of "n" signifies an object in use, + // "f" signifies a free object. If we already + // have an entry for this object, don't replace it. + String keyval = ((Keyword) token).getValue(); + if ("n".equals(keyval)) { + if (_xref[firstObj + i] == 0) { + _xref[firstObj + i] = offset; + } + } else if ("f".equals(keyval)) { + _numFreeObjects++; + } else { + throw new PdfMalformedException( + MessageConstants.PDF_HUL_84, // PDF-HUL-84 + _parser.getOffset()); + } + } + } + } + } catch (PdfException e) { + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + return false; + } catch (Exception e) { + info.setValid(false); String mess = MessageFormat.format( MessageConstants.PDF_HUL_157.getMessage(), e.getClass().getName()); JhoveMessage message = JhoveMessages.getMessageInstance( MessageConstants.PDF_HUL_157.getId(), mess); - info.setMessage( + info.setMessage( new ErrorMessage(message, e.getMessage(), _parser.getOffset())); return false; - } - return true; - } - - private boolean readDocCatalogDict(RepInfo info) throws IOException { - Property p = null; - _docCatDict = null; - _docCatalogList = new ArrayList(2); - // Get the Root reference which we had before, and - // resolve it to the dictionary object. 
- if (_docCatDictRef == null) { - info.setWellFormed(false); - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_85, 0)); // PDF-HUL-85 - return false; - } - try { - _docCatDict = (PdfDictionary) resolveIndirectObject(_docCatDictRef); - } catch (Exception e) { - _logger.warning("Tried to cast non-dictionary to PdfDictionary"); - e.printStackTrace(); - } - if (_docCatDict == null) { - // If no object was returned, the PDF's not well-formed - info.setWellFormed(false); - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_86, 0)); // PDF-HUL-86 - return false; - } else if (_docCatDict.getObjNumber() != _docCatDictRef - .getObjNumber()) { - // If the returned object nmumber is not the same as that requested + } + return true; + } + + private boolean readDocCatalogDict(RepInfo info) throws IOException { + Property p = null; + _docCatDict = null; + _docCatalogList = new ArrayList(2); + // Get the Root reference which we had before, and + // resolve it to the dictionary object. + if (_docCatDictRef == null) { + info.setWellFormed(false); + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_85, 0)); // PDF-HUL-85 + return false; + } + try { + _docCatDict = (PdfDictionary) resolveIndirectObject(_docCatDictRef); + } catch (Exception e) { + _logger.warning("Tried to cast non-dictionary to PdfDictionary"); + e.printStackTrace(); + } + if (_docCatDict == null) { + // If no object was returned, the PDF's not well-formed + info.setWellFormed(false); + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_86, 0)); // PDF-HUL-86 + return false; + } else if (_docCatDict.getObjNumber() != _docCatDictRef + .getObjNumber()) { + // If the returned object nmumber is not the same as that requested if (_logger.isLoggable(Level.WARNING)) { _logger.warning("Inconsistent Document Catalog Object Number"); - _logger.warning(String.format( - " - /Root indirect reference number: %d, returned object ID: %d.", - _docCatDictRef.getObjNumber(), _docCatDict.getObjNumber())); 
- } - info.setWellFormed(false); - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_140, 0)); // PDF-HUL-140 - return false; - } - try { - // Check that the catalog has a key type and the types value is - // "Catalog" - if (!checkTypeKey(_docCatDict, info, KEY_VAL_CATALOG, - MessageConstants.PDF_HUL_141, // PDF-HUL-141 - MessageConstants.PDF_HUL_142, // PDF-HUL-142 - MessageConstants.PDF_HUL_143)) { // PDF-HUL-143 - return false; - } - - PdfObject viewPref = _docCatDict.get(DICT_KEY_VIEWER_PREFS); - viewPref = resolveIndirectObject(viewPref); - if (viewPref instanceof PdfDictionary) { - _viewPrefDict = (PdfDictionary) viewPref; - p = buildViewPrefProperty(_viewPrefDict); - _docCatalogList.add(p); - } - String pLayoutText = DEFAULT_PAGE_LAYOUT; // default - PdfObject pLayout = resolveIndirectObject( - _docCatDict.get(DICT_KEY_PAGE_LAYOUT)); - if (pLayout instanceof PdfSimpleObject) { - pLayoutText = ((PdfSimpleObject) pLayout).getStringValue(); - } - p = new Property(PROP_NAME_PAGE_LAYOUT, PropertyType.STRING, - pLayoutText); - _docCatalogList.add(p); - - String pModeText = DEFAULT_MODE; // default - PdfObject pMode = resolveIndirectObject( - _docCatDict.get(DICT_KEY_PAGE_MODE)); - if (pMode instanceof PdfSimpleObject) { - pModeText = ((PdfSimpleObject) pMode).getStringValue(); - } - p = new Property(DICT_KEY_PAGE_MODE, PropertyType.STRING, - pModeText); - _docCatalogList.add(p); - - if (!_encrypted) { - PdfObject outlines = resolveIndirectObject( - _docCatDict.get(DICT_KEY_OUTLINES)); - if (outlines instanceof PdfDictionary) { - _outlineDict = (PdfDictionary) outlines; - } - } - - PdfObject lang = resolveIndirectObject( - _docCatDict.get(DICT_KEY_LANG)); - if (lang != null && lang instanceof PdfSimpleObject) { - String langText = ((PdfSimpleObject) lang).getStringValue(); - p = new Property(PROP_NAME_LANG, PropertyType.STRING, - _encrypted ? 
ENCRYPTED : langText); - _docCatalogList.add(p); - } - - // The Pages dictionary doesn't go into the property, - // but this is a convenient time to grab it and the page label - // dictionary. - _pagesDictRef = (PdfIndirectObj) _docCatDict.get(DICT_KEY_PAGES); - if (!_encrypted) { - _pageLabelDict = (PdfDictionary) resolveIndirectObject( - _docCatDict.get(DICT_KEY_PAGE_LABELS)); - } - - // Grab the Version entry, and use it to override the - // file header IF it's later. - PdfObject vers = resolveIndirectObject( - _docCatDict.get(DICT_KEY_VERSION)); - if (vers instanceof PdfSimpleObject) { - String versString = ((PdfSimpleObject) vers).getStringValue(); - String infoVersString = _version; - try { - double ver = Double.parseDouble(versString); - double infoVer = Double.parseDouble(infoVersString); - /* Set a message if this doesn't agree with RepInfo */ - if (ver != infoVer) { - String mess = MessageFormat.format( - MessageConstants.PDF_HUL_87.getMessage(), - infoVersString, versString); - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_87.getId(), mess); - info.setMessage(new InfoMessage(message)); - } - /* Replace the version in RepInfo if this is larger */ - if (ver > infoVer) { - _version = versString; - } - } catch (NumberFormatException e) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_88); // PDF-HUL-88 - } - } - - // If extensions are defined get the extensionlevel information and the - // baseVersion from the extensions - PdfObject extensions = _docCatDict.get(DICT_KEY_EXTENSIONS); - if (extensions != null) { - if (extensions instanceof PdfDictionary) { - Iterator extensionsIter = ((PdfDictionary) extensions).iterator(); - while (extensionsIter.hasNext()) { - - PdfObject extensionObj = extensionsIter.next(); - // Arlington PDF Model defines extension as a direct object - // https://github.com/pdf-association/arlington-pdf-model/blob/master/tsv/latest/Extensions.tsv - if(extensionObj instanceof PdfIndirectObj) 
{ - info.setWellFormed(false); - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_156.getId(), - MessageConstants.PDF_HUL_156.getMessage()); - info.setMessage(new ErrorMessage(message)); // PDF-HUL-156 - } else { - PdfDictionary extension = (PdfDictionary) extensionObj; - Set developerPrefixKeys = ((PdfDictionary) extensions).getKeys(); - for (String developerPrefixKey : developerPrefixKeys) { - if (PdfStrings.PREFIXNAMESREGISTY.contains(developerPrefixKey.toString())) { - p = new Property(PROP_NAME_DEVELOPERPREFIX, PropertyType.STRING, - developerPrefixKey.toString()); - _docCatalogList.add(p); - PdfSimpleObject BaseVersion = (PdfSimpleObject) extension.get(DICT_KEY_BASEVERSION); - String infoVersString = _version; - String versString = BaseVersion.getStringValue(); - double ver = Double.parseDouble(versString); - double infoVer = Double.parseDouble(infoVersString); - try { - if (infoVer != ver) { - String mess = MessageFormat.format( - MessageConstants.PDF_HUL_87.getMessage(), - infoVersString, ver); - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_87.getId(), mess); - info.setMessage(new InfoMessage(message)); - } else { - p = new Property(PROP_NAME_BASEVERSION, PropertyType.STRING, ver); - _docCatalogList.add(p); - } - } catch (NumberFormatException e) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_88); // PDF-HUL-88 - } - PdfSimpleObject extensionLevel = (PdfSimpleObject) extension - .get(DICT_KEY_EXTENSIONLEVEL); - if (extensionLevel != null) { - p = new Property(PROP_NAME_EXTENSIONLEVEL, PropertyType.INTEGER, - extensionLevel.getIntValue()); - _docCatalogList.add(p); - } - } else { - // There is an unknown developer prefix - info.setMessage(new InfoMessage(MessageConstants.PDF_HUL_154, - developerPrefixKey.toString())); // PDF-HUL-154 - } - } - } - } - } - } - - // Get the Names dictionary in order to grab the - // EmbeddedFiles and Dests entries. 
- try { - PdfDictionary namesDict = null; - if (!_encrypted) { - namesDict = (PdfDictionary) resolveIndirectObject( - _docCatDict.get(DICT_KEY_NAMES)); - } - if (namesDict != null) { - PdfDictionary embeddedDict = (PdfDictionary) resolveIndirectObject( - namesDict.get(DICT_KEY_EMBEDDED_FILES)); - if (embeddedDict != null) { - _embeddedFiles = new NameTreeNode(this, null, - embeddedDict); - } - - PdfDictionary dDict = (PdfDictionary) resolveIndirectObject( - namesDict.get(DICT_KEY_DESTS)); - if (dDict != null) { - _destNames = new NameTreeNode(this, null, dDict); - } - } - } catch (ClassCastException ce) { - _logger.info("ClassCastException on names dictionary"); - throw new PdfInvalidException(MessageConstants.PDF_HUL_89); // PDF-HUL-89 - } catch (Exception e) { - _logger.info("Exception on names dictionary: " - + e.getClass().getName()); - throw new PdfMalformedException(MessageConstants.PDF_HUL_90); // PDF-HUL-90 - } - - // Get the optional Dests dictionary. Note that destinations - // may be specified in either of two completely different - // ways: a dictionary here, or a name tree from the Names - // dictionary. 
- - try { - _destsDict = (PdfDictionary) resolveIndirectObject( - _docCatDict.get(DICT_KEY_DESTS)); - } catch (ClassCastException ce) { - _logger.info("ClassCastException on dests dictionary"); - throw new PdfInvalidException(MessageConstants.PDF_HUL_91); // PDF-HUL-91 - } catch (Exception e) { - _logger.info("Exception on dests dictionary: " - + e.getClass().getName()); - throw new PdfMalformedException(MessageConstants.PDF_HUL_92); // PDF-HUL-92 - } - } - - catch (PdfException e) { - e.disparage(info); // clears Valid or WellFormed as appropriate - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - // Keep going if it's only invalid - return (e instanceof PdfInvalidException); - } catch (Exception e) { - // Unexpected exception -- declare not well-formed - info.setWellFormed(false); - info.setValid(false); - String mess = MessageFormat.format( - MessageConstants.PDF_HUL_158.getMessage(), - e.getClass().getName()); - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_158.getId(), mess); - info.setMessage( - new ErrorMessage(message, e.getMessage(), _parser.getOffset())); - return false; - } - return true; - } - - protected boolean readEncryptDict(RepInfo info) throws IOException { - String filterText = ""; - String effText = null; - // Get the reference which we had before, and - // resolve it to the dictionary object. 
+ _logger.warning(String.format( + " - /Root indirect reference number: %d, returned object ID: %d.", + _docCatDictRef.getObjNumber(), _docCatDict.getObjNumber())); + } + info.setWellFormed(false); + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_140, 0)); // PDF-HUL-140 + return false; + } + try { + // Check that the catalog has a key type and the types value is + // "Catalog" + if (!checkTypeKey(_docCatDict, info, KEY_VAL_CATALOG, + MessageConstants.PDF_HUL_141, // PDF-HUL-141 + MessageConstants.PDF_HUL_142, // PDF-HUL-142 + MessageConstants.PDF_HUL_143)) { // PDF-HUL-143 + return false; + } + + PdfObject viewPref = _docCatDict.get(DICT_KEY_VIEWER_PREFS); + viewPref = resolveIndirectObject(viewPref); + if (viewPref instanceof PdfDictionary) { + _viewPrefDict = (PdfDictionary) viewPref; + p = buildViewPrefProperty(_viewPrefDict); + _docCatalogList.add(p); + } + String pLayoutText = DEFAULT_PAGE_LAYOUT; // default + PdfObject pLayout = resolveIndirectObject( + _docCatDict.get(DICT_KEY_PAGE_LAYOUT)); + if (pLayout instanceof PdfSimpleObject) { + pLayoutText = ((PdfSimpleObject) pLayout).getStringValue(); + } + p = new Property(PROP_NAME_PAGE_LAYOUT, PropertyType.STRING, + pLayoutText); + _docCatalogList.add(p); + + String pModeText = DEFAULT_MODE; // default + PdfObject pMode = resolveIndirectObject( + _docCatDict.get(DICT_KEY_PAGE_MODE)); + if (pMode instanceof PdfSimpleObject) { + pModeText = ((PdfSimpleObject) pMode).getStringValue(); + } + p = new Property(DICT_KEY_PAGE_MODE, PropertyType.STRING, + pModeText); + _docCatalogList.add(p); + + if (!_encrypted) { + PdfObject outlines = resolveIndirectObject( + _docCatDict.get(DICT_KEY_OUTLINES)); + if (outlines instanceof PdfDictionary) { + _outlineDict = (PdfDictionary) outlines; + } + } + + PdfObject lang = resolveIndirectObject( + _docCatDict.get(DICT_KEY_LANG)); + if (lang != null && lang instanceof PdfSimpleObject) { + String langText = ((PdfSimpleObject) lang).getStringValue(); + p = new 
Property(PROP_NAME_LANG, PropertyType.STRING, + _encrypted ? ENCRYPTED : langText); + _docCatalogList.add(p); + } + + // The Pages dictionary doesn't go into the property, + // but this is a convenient time to grab it and the page label + // dictionary. + _pagesDictRef = (PdfIndirectObj) _docCatDict.get(DICT_KEY_PAGES); + if (!_encrypted) { + _pageLabelDict = (PdfDictionary) resolveIndirectObject( + _docCatDict.get(DICT_KEY_PAGE_LABELS)); + } + + // Grab the Version entry, and use it to override the + // file header IF it's later. + PdfObject vers = resolveIndirectObject( + _docCatDict.get(DICT_KEY_VERSION)); + if (vers instanceof PdfSimpleObject) { + String versString = ((PdfSimpleObject) vers).getStringValue(); + String infoVersString = _version; + try { + double ver = Double.parseDouble(versString); + double infoVer = Double.parseDouble(infoVersString); + /* Set a message if this doesn't agree with RepInfo */ + if (ver != infoVer) { + String mess = MessageFormat.format( + MessageConstants.PDF_HUL_87.getMessage(), + infoVersString, versString); + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_87.getId(), mess); + info.setMessage(new InfoMessage(message)); + } + /* Replace the version in RepInfo if this is larger */ + if (ver > infoVer) { + _version = versString; + } + } catch (NumberFormatException e) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_88); // PDF-HUL-88 + } + } + + // If extensions are defined get the extensionlevel information and the + // baseVersion from the extensions + PdfObject extensions = _docCatDict.get(DICT_KEY_EXTENSIONS); + if (extensions != null) { + if (extensions instanceof PdfDictionary) { + Iterator extensionsIter = ((PdfDictionary) extensions).iterator(); + while (extensionsIter.hasNext()) { + + PdfObject extensionObj = extensionsIter.next(); + // Arlington PDF Model defines extension as a direct object + // 
https://github.com/pdf-association/arlington-pdf-model/blob/master/tsv/latest/Extensions.tsv + if (extensionObj instanceof PdfIndirectObj) { + info.setWellFormed(false); + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_156.getId(), + MessageConstants.PDF_HUL_156.getMessage()); + info.setMessage(new ErrorMessage(message)); // PDF-HUL-156 + } else { + PdfDictionary extension = (PdfDictionary) extensionObj; + Set developerPrefixKeys = ((PdfDictionary) extensions).getKeys(); + for (String developerPrefixKey : developerPrefixKeys) { + if (PdfStrings.PREFIXNAMESREGISTY.contains(developerPrefixKey.toString())) { + p = new Property(PROP_NAME_DEVELOPERPREFIX, PropertyType.STRING, + developerPrefixKey.toString()); + _docCatalogList.add(p); + PdfSimpleObject BaseVersion = (PdfSimpleObject) extension.get(DICT_KEY_BASEVERSION); + String infoVersString = _version; + String versString = BaseVersion.getStringValue(); + double ver = Double.parseDouble(versString); + double infoVer = Double.parseDouble(infoVersString); + try { + if (infoVer != ver) { + String mess = MessageFormat.format( + MessageConstants.PDF_HUL_87.getMessage(), + infoVersString, ver); + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_87.getId(), mess); + info.setMessage(new InfoMessage(message)); + } else { + p = new Property(PROP_NAME_BASEVERSION, PropertyType.STRING, ver); + _docCatalogList.add(p); + } + } catch (NumberFormatException e) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_88); // PDF-HUL-88 + } + PdfSimpleObject extensionLevel = (PdfSimpleObject) extension + .get(DICT_KEY_EXTENSIONLEVEL); + if (extensionLevel != null) { + p = new Property(PROP_NAME_EXTENSIONLEVEL, PropertyType.INTEGER, + extensionLevel.getIntValue()); + _docCatalogList.add(p); + } + } else { + // There is an unknown developer prefix + info.setMessage(new InfoMessage(MessageConstants.PDF_HUL_154, + developerPrefixKey.toString())); // PDF-HUL-154 + } 
+ } + } + } + } + } + + // Get the Names dictionary in order to grab the + // EmbeddedFiles and Dests entries. + try { + PdfDictionary namesDict = null; + if (!_encrypted) { + namesDict = (PdfDictionary) resolveIndirectObject( + _docCatDict.get(DICT_KEY_NAMES)); + } + if (namesDict != null) { + PdfDictionary embeddedDict = (PdfDictionary) resolveIndirectObject( + namesDict.get(DICT_KEY_EMBEDDED_FILES)); + if (embeddedDict != null) { + _embeddedFiles = new NameTreeNode(this, null, + embeddedDict); + } + + PdfDictionary dDict = (PdfDictionary) resolveIndirectObject( + namesDict.get(DICT_KEY_DESTS)); + if (dDict != null) { + _destNames = new NameTreeNode(this, null, dDict); + } + } + } catch (ClassCastException ce) { + _logger.info("ClassCastException on names dictionary"); + throw new PdfInvalidException(MessageConstants.PDF_HUL_89); // PDF-HUL-89 + } catch (Exception e) { + _logger.info("Exception on names dictionary: " + + e.getClass().getName()); + throw new PdfMalformedException(MessageConstants.PDF_HUL_90); // PDF-HUL-90 + } + + // Get the optional Dests dictionary. Note that destinations + // may be specified in either of two completely different + // ways: a dictionary here, or a name tree from the Names + // dictionary. 
+ + try { + _destsDict = (PdfDictionary) resolveIndirectObject( + _docCatDict.get(DICT_KEY_DESTS)); + } catch (ClassCastException ce) { + _logger.info("ClassCastException on dests dictionary"); + throw new PdfInvalidException(MessageConstants.PDF_HUL_91); // PDF-HUL-91 + } catch (Exception e) { + _logger.info("Exception on dests dictionary: " + + e.getClass().getName()); + throw new PdfMalformedException(MessageConstants.PDF_HUL_92); // PDF-HUL-92 + } + } + + catch (PdfException e) { + e.disparage(info); // clears Valid or WellFormed as appropriate + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + // Keep going if it's only invalid + return (e instanceof PdfInvalidException); + } catch (Exception e) { + // Unexpected exception -- declare not well-formed + info.setWellFormed(false); + info.setValid(false); + String mess = MessageFormat.format( + MessageConstants.PDF_HUL_158.getMessage(), + e.getClass().getName()); + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_158.getId(), mess); + info.setMessage( + new ErrorMessage(message, e.getMessage(), _parser.getOffset())); + return false; + } + return true; + } + + protected boolean readEncryptDict(RepInfo info) throws IOException { + String filterText = ""; + String effText = null; + // Get the reference which we had before, and + // resolve it to the dictionary object. 
if (_encryptDictRef == null && _encryptDict == null) { - return true; // encryption entry is optional - } - try { - _encryptList = new ArrayList(6); + return true; // encryption entry is optional + } + try { + _encryptList = new ArrayList(6); if (_encryptDict == null) { _encryptDict = (PdfDictionary) resolveIndirectObject(_encryptDictRef); } PdfObject filter = _encryptDict.get(DICT_KEY_FILTER); - if (filter instanceof PdfSimpleObject) { - Token tok = ((PdfSimpleObject) filter).getToken(); - if (tok instanceof Name) { - filterText = ((Name) tok).getValue(); - } - } - Property p = new Property(PROP_NAME_SECURITY_HANDLER, - PropertyType.STRING, filterText); - _encryptList.add(p); - // PdfObject eff = dict.get("EFF"); - if (filter instanceof PdfSimpleObject) { - Token tok = ((PdfSimpleObject) filter).getToken(); - if (tok instanceof Name) { - effText = ((Name) tok).getValue(); - } - } - if (effText != null) { - p = new Property(PROP_NAME_EFF, PropertyType.STRING, effText); - _encryptList.add(p); - } - - int algValue = 0; + if (filter instanceof PdfSimpleObject) { + Token tok = ((PdfSimpleObject) filter).getToken(); + if (tok instanceof Name) { + filterText = ((Name) tok).getValue(); + } + } + Property p = new Property(PROP_NAME_SECURITY_HANDLER, + PropertyType.STRING, filterText); + _encryptList.add(p); + // PdfObject eff = dict.get("EFF"); + if (filter instanceof PdfSimpleObject) { + Token tok = ((PdfSimpleObject) filter).getToken(); + if (tok instanceof Name) { + effText = ((Name) tok).getValue(); + } + } + if (effText != null) { + p = new Property(PROP_NAME_EFF, PropertyType.STRING, effText); + _encryptList.add(p); + } + + int algValue = 0; PdfObject algorithm = _encryptDict.get(DICT_KEY_V); - if (algorithm instanceof PdfSimpleObject) { - Token tok = ((PdfSimpleObject) algorithm).getToken(); - if (tok instanceof Numeric) { - algValue = ((Numeric) tok).getIntegerValue(); - if (_je != null && _je.getShowRawFlag()) { - p = new Property(PROP_NAME_ALGORITHM, - 
PropertyType.INTEGER, new Integer(algValue)); - } else { - try { - p = new Property(PROP_NAME_ALGORITHM, - PropertyType.STRING, - PdfStrings.ALGORITHM[algValue]); - } catch (ArrayIndexOutOfBoundsException aioobe) { - throw new PdfInvalidException // PDF-HUL-93 - (MessageConstants.PDF_HUL_93, _parser.getOffset()); - } - } - if (p != null) { - _encryptList.add(p); - } - } - } - - int keyLen = 40; + if (algorithm instanceof PdfSimpleObject) { + Token tok = ((PdfSimpleObject) algorithm).getToken(); + if (tok instanceof Numeric) { + algValue = ((Numeric) tok).getIntegerValue(); + if (_je != null && _je.getShowRawFlag()) { + p = new Property(PROP_NAME_ALGORITHM, + PropertyType.INTEGER, new Integer(algValue)); + } else { + try { + p = new Property(PROP_NAME_ALGORITHM, + PropertyType.STRING, + PdfStrings.ALGORITHM[algValue]); + } catch (ArrayIndexOutOfBoundsException aioobe) { + throw new PdfInvalidException // PDF-HUL-93 + (MessageConstants.PDF_HUL_93, _parser.getOffset()); + } + } + if (p != null) { + _encryptList.add(p); + } + } + } + + int keyLen = 40; PdfObject length = _encryptDict.get(DICT_KEY_LENGTH); - if (length instanceof PdfSimpleObject) { - Token tok = ((PdfSimpleObject) length).getToken(); - if (tok instanceof Numeric) { - keyLen = ((Numeric) tok).getIntegerValue(); - } - if (_je != null) { - p = new Property(PROP_NAME_KEY_LENGTH, PropertyType.INTEGER, - new Integer(keyLen)); - _encryptList.add(p); - } - } - - if (FILTER_VAL_STANDARD.equals(filterText)) { - List stdList = new ArrayList(4); - // Flags have a known meaning only if Standard - // security handler was specified + if (length instanceof PdfSimpleObject) { + Token tok = ((PdfSimpleObject) length).getToken(); + if (tok instanceof Numeric) { + keyLen = ((Numeric) tok).getIntegerValue(); + } + if (_je != null) { + p = new Property(PROP_NAME_KEY_LENGTH, PropertyType.INTEGER, + new Integer(keyLen)); + _encryptList.add(p); + } + } + + if (FILTER_VAL_STANDARD.equals(filterText)) { + List stdList = new 
ArrayList(4); + // Flags have a known meaning only if Standard + // security handler was specified PdfObject flagObj = _encryptDict.get(DICT_KEY_P); PdfObject revObj = _encryptDict.get(DICT_KEY_R); - int rev = 2; // assume old rev if not present - if (revObj instanceof PdfSimpleObject) { - rev = ((PdfSimpleObject) revObj).getIntValue(); - } - if (flagObj instanceof PdfSimpleObject) { - int flags = ((PdfSimpleObject) flagObj).getIntValue(); - String[] flagStrs; - if (rev == 2) { - flagStrs = PdfStrings.USERPERMFLAGS2; - } else { - flagStrs = PdfStrings.USERPERMFLAGS3; - } - p = buildUserPermProperty(flags, flagStrs); - stdList.add(p); - - stdList.add(new Property(PROP_NAME_REVISION, - PropertyType.INTEGER, new Integer(rev))); - } + int rev = 2; // assume old rev if not present + if (revObj instanceof PdfSimpleObject) { + rev = ((PdfSimpleObject) revObj).getIntValue(); + } + if (flagObj instanceof PdfSimpleObject) { + int flags = ((PdfSimpleObject) flagObj).getIntValue(); + String[] flagStrs; + if (rev == 2) { + flagStrs = PdfStrings.USERPERMFLAGS2; + } else { + flagStrs = PdfStrings.USERPERMFLAGS3; + } + p = buildUserPermProperty(flags, flagStrs); + stdList.add(p); + + stdList.add(new Property(PROP_NAME_REVISION, + PropertyType.INTEGER, new Integer(rev))); + } PdfObject oObj = _encryptDict.get("O"); - if (oObj != null) { - if (oObj instanceof PdfSimpleObject) { - stdList.add(new Property(PROP_NAME_OWNER_STRING, - PropertyType.STRING, - toHex(((PdfSimpleObject) oObj).getRawBytes()))); - } - } + if (oObj != null) { + if (oObj instanceof PdfSimpleObject) { + stdList.add(new Property(PROP_NAME_OWNER_STRING, + PropertyType.STRING, + toHex(((PdfSimpleObject) oObj).getRawBytes()))); + } + } PdfObject uObj = _encryptDict.get("U"); - if (uObj != null) { - if (uObj instanceof PdfSimpleObject) { - stdList.add(new Property(PROP_NAME_USER_STRING, - PropertyType.STRING, - toHex(((PdfSimpleObject) uObj).getRawBytes()))); - } - } - // Required if ExtensionLevel 3 and Encryption 
Algorithm (V) is 5 - // Defined in Adobe® Supplement to the ISO 32000 - if (algValue == 5) { + if (uObj != null) { + if (uObj instanceof PdfSimpleObject) { + stdList.add(new Property(PROP_NAME_USER_STRING, + PropertyType.STRING, + toHex(((PdfSimpleObject) uObj).getRawBytes()))); + } + } + // Required if ExtensionLevel 3 and Encryption Algorithm (V) is 5 + // Defined in Adobe® Supplement to the ISO 32000 + if (algValue == 5) { PdfObject oeObj = _encryptDict.get("OE"); - if (oeObj != null) { - if (oeObj instanceof PdfSimpleObject) { - stdList.add(new Property(PROP_NAME_OWNERKEY_STRING, - PropertyType.STRING, - toHex(((PdfSimpleObject) oeObj).getRawBytes()))); - } - } else { - // if algValue is 5; OE is mandatory - throw new PdfInvalidException(MessageConstants.PDF_HUL_152, _parser.getOffset()); - } + if (oeObj != null) { + if (oeObj instanceof PdfSimpleObject) { + stdList.add(new Property(PROP_NAME_OWNERKEY_STRING, + PropertyType.STRING, + toHex(((PdfSimpleObject) oeObj).getRawBytes()))); + } + } else { + // if algValue is 5; OE is mandatory + throw new PdfInvalidException(MessageConstants.PDF_HUL_152, _parser.getOffset()); + } PdfObject ueObj = _encryptDict.get("UE"); - if (ueObj != null) { - if (ueObj instanceof PdfSimpleObject) { - stdList.add(new Property(PROP_NAME_USERKEY_STRING, - PropertyType.STRING, - toHex(((PdfSimpleObject) ueObj).getRawBytes()))); - } - } else { - // if algValue is 5; UE is mandatory - throw new PdfInvalidException(MessageConstants.PDF_HUL_153, _parser.getOffset()); - } - } - _encryptList.add(new Property( - PROP_NAME_STANDARD_SECURITY_HANDLER, - PropertyType.PROPERTY, PropertyArity.LIST, stdList)); - } - PdfObject streamEncrypted = _encryptDict.get(DICT_KEY_STMF); - if (streamEncrypted instanceof PdfSimpleObject) { - _streamsEncrypted = true; - } - - } catch (PdfException e) { - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - return (e instanceof PdfInvalidException); - } - return true; 
- } - - protected boolean readDocInfoDict(RepInfo info) { - // Get the Info reference which we had before, and - // resolve it to the dictionary object. - if (_docInfoDictRef == null) { - return true; // Info is optional - } - _docInfoList = new ArrayList(9); - try { - _docInfoDict = (PdfDictionary) resolveIndirectObject( - _docInfoDictRef); - addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_TITLE, - PROP_NAME_TITLE); - addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_AUTHOR, - PROP_NAME_AUTHOR); - addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_SUBJECT, - PROP_NAME_SUBJECT); - addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_KEYWORDS, - PROP_NAME_KEYWORDS); - addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_CREATOR, - PROP_NAME_CREATOR); - addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_PRODUCER, - PROP_NAME_PRODUCER); - - // CreationDate requires string-to-date conversion - // ModDate does too - addDateProperty(_docInfoDict, _docInfoList, DICT_KEY_CREATION_DATE, - PROP_NAME_CREATION_DATE); - addDateProperty(_docInfoDict, _docInfoList, DICT_KEY_MODIFIED_DATE, - PROP_NAME_MODIFIED_DATE); - addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_TRAPPED, - PROP_NAME_TRAPPED); - } catch (PdfException e) { - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - // Keep parsing if it's only invalid - return (e instanceof PdfInvalidException); - } catch (Exception e) { - info.setWellFormed(false); - String mess = MessageFormat.format( - MessageConstants.PDF_HUL_94.getMessage(), - e.getClass().getName()); - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_94.getId(), mess); - info.setMessage(new ErrorMessage(message)); // PDF-HUL-94 - } - return true; - } - - protected boolean readDocumentTree(RepInfo info) { - try { - if (_pagesDictRef == null) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_95); // PDF-HUL-95 - } - - PdfObject pagesObj = 
resolveIndirectObject(_pagesDictRef); - if (pagesObj != null && !(pagesObj instanceof PdfDictionary)) { - throw new PdfMalformedException(MessageConstants.PDF_HUL_97); // PDF-HUL-97 - } else if (pagesObj != null) { - - PdfDictionary pagesDict = (PdfDictionary) pagesObj; - - // Check that the pages dict has a key type and the types value is - // Pages - if (!checkTypeKey(pagesDict, info, KEY_VAL_PAGES, - MessageConstants.PDF_HUL_146, // PDF-HUL-146 - MessageConstants.PDF_HUL_144, // PDF-HUL-144 - MessageConstants.PDF_HUL_145)) { // PDF-HUL-145 - return false; - } - - _docTreeRoot = new PageTreeNode(this, null, pagesDict); - _docTreeRoot.buildSubtree(true, MAX_PAGE_TREE_DEPTH); - } - } catch (PdfException e) { - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - // Continue parsing if it's only invalid - return (e instanceof PdfInvalidException); - } catch (ArrayIndexOutOfBoundsException excep) { - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_96, - _parser.getOffset())); // PDF-HUL-96 - info.setWellFormed(false); - return false; - } catch (Exception e) { - // Catch any odd exceptions - String mess = MessageFormat.format( - MessageConstants.PDF_HUL_98.getMessage(), - e.getClass().getName()); - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_98.getId(), mess); - info.setMessage(new ErrorMessage(message, _parser.getOffset())); // PDF-HUL-98 - info.setWellFormed(false); - return false; - } - return true; - } - - protected boolean readPageLabelTree(RepInfo info) { - // the page labels number tree is optional. 
- try { - if (_pageLabelDict != null) { - _pageLabelRoot = new PageLabelNode(this, null, _pageLabelDict); - _pageLabelRoot.buildSubtree(); - } - } catch (PdfException e) { - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - // Continue parsing if it's only invalid - return (e instanceof PdfInvalidException); - } catch (Exception e) { - info.setWellFormed(false); - String mess = MessageFormat.format( - MessageConstants.PDF_HUL_99.getMessage(), - e.getClass().getName()); - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_99.getId(), mess); - info.setMessage(new ErrorMessage(message)); // PDF-HUL-99 - return false; - } - return true; // always succeeds - } - - protected boolean readXMPData(RepInfo info) { - try { - PdfStream metadata = (PdfStream) resolveIndirectObject( - _docCatDict.get(DICT_KEY_METADATA)); - if (metadata == null) { - return true; // Not required - } - // PdfDictionary metaDict = metadata.getDict (); - - // Create an InputSource to feed the parser. - SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setNamespaceAware(true); - XMLReader parser = factory.newSAXParser().getXMLReader(); - PdfXMPSource src = new PdfXMPSource(metadata, getFile()); - XMPHandler handler = new XMPHandler(); - parser.setContentHandler(handler); - parser.setErrorHandler(handler); - - // We have to parse twice. The first time, we may get - // an encoding change as part of an exception thrown. If this - // happens, we create a new InputSource with the encoding, and - // continue. 
- try { - parser.parse(src); - _xmpProp = src.makeProperty(); - } catch (SAXException se) { - String msg = se.getMessage(); - if (msg != null && msg.startsWith(ENCODING_PREFIX)) { - String encoding = msg.substring(5); - try { - src = new PdfXMPSource(metadata, getFile(), encoding); - parser.parse(src); - _xmpProp = src.makeProperty(); - } catch (UnsupportedEncodingException uee) { - _logger.log(Level.INFO, - "Attempt to use explicit encoding to parse XMP metadata failed.", - uee); - throw new PdfInvalidException( - MessageConstants.PDF_HUL_100); // PDF-HUL-100 - } - } - } - - } catch (PdfException e) { - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - // Continue parsing if it's only invalid - return (e instanceof PdfInvalidException); - } catch (Exception e) { - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_101, // PDF-HUL-101 - _parser.getOffset())); - info.setValid(false); - return false; - } - return true; - } - - protected void findExternalStreams(RepInfo info) throws IOException { - _extStreamsList = new LinkedList(); - // stop processing if there is no root for the document tree - if (_docTreeRoot == null) - return; - _docTreeRoot.startWalk(); - try { - for (;;) { - // Get all the page objects in the document sequentially - PageObject page = _docTreeRoot.nextPageObject(); - if (page == null) { - break; - } - // Get the streams for the page and walk through them - List streams = page.getContentStreams(); - if (streams != null) { - ListIterator streamIter = streams.listIterator(); - while (streamIter.hasNext()) { - PdfStream stream = streamIter.next(); - String specStr = stream.getFileSpecification(); - if (specStr != null) { - Property prop = new Property(PROP_NAME_FILE, - PropertyType.STRING, specStr); - _extStreamsList.add(prop); - } - } - } - } - } catch (PdfException e) { - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage())); - } catch (Exception e) { - 
info.setWellFormed(false); - String mess = MessageFormat.format( - MessageConstants.PDF_HUL_102.getMessage(), - e.getClass().getName()); - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_102.getId(), mess); - info.setMessage(new ErrorMessage(message)); // PDF-HUL-102 - } - } - - /** - * Locates the filters in the content stream dictionaries - * and generate a list of unique pipelines. - * - * @return false if the filter structure is - * defective. - */ - protected boolean findFilters(RepInfo info) throws IOException { - _filtersList = new LinkedList(); - // stop processing if there is no root for the document tree - if (_docTreeRoot == null) - return false; - _docTreeRoot.startWalk(); - try { - for (;;) { - // Get all the page objects in the document sequentially - PageObject page = _docTreeRoot.nextPageObject(); - if (page == null) { - break; - } - // Get the streams for the page and walk through them - List streams = page.getContentStreams(); - if (streams != null) { - ListIterator streamIter = streams.listIterator(); - while (streamIter.hasNext()) { - PdfStream stream = streamIter.next(); - Filter[] filters = stream.getFilters(); - extractFilters(filters, stream); - } - } - } - } catch (PdfException e) { - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - // Continue parsing if it's only invalid - return (e instanceof PdfInvalidException); - } - return true; - } - - /** - * Finds the filters in a stream or array object which is the value - * of a stream's Filter key, and put them in _filtersList - * if a duplicate isn't there already. If the name is - * "Crypt", appends a colon and the name if available. - * Returns the filter string whether it's added or not, - * or null if there are no filters. - */ - protected String extractFilters(Filter[] filters, PdfStream stream) { - /* - * Concatenate the names into a string of names separated - * by spaces. 
- */ - int len = filters.length; - if (len == 0) { - return null; - } - StringBuffer buf = new StringBuffer(); - for (int i = 0; i < len; i++) { - Filter filt = filters[i]; - String fname = filt.getFilterName(); - buf.append(fname); - /* If it's a Crypt filter, add the crypt name. */ - if (FILTER_NAME_CRYPT.equals(fname)) { - String cname = filt.getNameParam(); - if (cname != null) { - buf.append(":" + cname); - } - } - if (i < len - 1) { - buf.append(' '); - } - } - String filterStr = buf.toString(); - boolean unique = true; - // Check for uniqueness. - Iterator iter = _filtersList.iterator(); - while (iter.hasNext()) { - Property p = iter.next(); - String s = (String) p.getValue(); - if (s.equals(filterStr)) { - unique = false; - break; - } - } - if (filterStr != null && unique) { - Property prop = new Property(PROP_NAME_FILTER_PIPELINE, - PropertyType.STRING, filterStr); - _filtersList.add(prop); - } - return filterStr; - } - - protected void findImages(RepInfo info) throws IOException { - _imagesList = new LinkedList(); - // needed if object streams are encrypted - if (_docTreeRoot == null) { - return; - } - _docTreeRoot.startWalk(); - try { - for (;;) { - // Get all the page objects in the document sequentially - PageObject page = _docTreeRoot.nextPageObject(); - if (page == null) { - break; - } - // Get the resources for the page and look for image XObjects - PdfDictionary rsrc = page.getResources(); - if (rsrc != null) { - PdfDictionary xo = (PdfDictionary) resolveIndirectObject( - rsrc.get(RESOURCE_NAME_XOBJECT)); - if (xo != null) { - Iterator iter = xo.iterator(); - while (iter.hasNext()) { - // Get an XObject and check if it's an image. 
- _logger.info("Getting image"); - PdfDictionary xobdict = null; - PdfObject xob = resolveIndirectObject(iter.next()); - if (xob instanceof PdfStream) { - xobdict = ((PdfStream) xob).getDict(); - } - if (xobdict != null) { - PdfSimpleObject subtype = (PdfSimpleObject) xobdict - .get(DICT_KEY_XOBJ_SUBTYPE); - if (XOBJ_SUBTYPE_IMAGE - .equals(subtype.getStringValue())) { - // It's an image XObject. Report stuff. - _logger.info("Image XObject"); - List imgList = new ArrayList( - 10); - Property prop = new Property( - PROP_NAME_IMAGE, - PropertyType.PROPERTY, - PropertyArity.LIST, imgList); - NisoImageMetadata niso = new NisoImageMetadata(); - imgList.add(new Property( - PROP_NAME_NISO_IMAGE_MD, - PropertyType.NISOIMAGEMETADATA, - niso)); - PdfObject widthBase = xobdict - .get(DICT_KEY_WIDTH); - PdfSimpleObject widObj = (PdfSimpleObject) resolveIndirectObject( - widthBase); - PdfObject heightBase = xobdict - .get(DICT_KEY_HEIGHT); - PdfSimpleObject htObj = (PdfSimpleObject) resolveIndirectObject( - heightBase); - if(widObj != null || htObj != null ) { - niso.setImageWidth(widObj.getIntValue()); - niso.setImageLength(htObj.getIntValue()); - } else { - info.setWellFormed(false); - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_159.getId(), - MessageConstants.PDF_HUL_159.getMessage()); - info.setMessage(new ErrorMessage(message)); // PDF-HUL-159 - } - // Check for filters to add to the filter - // list - Filter[] filters = ((PdfStream) xob) - .getFilters(); - // Try to derive the image MIME type from - // filter names - String mimeType = imageMimeFromFilters( - filters); - niso.setMimeType(mimeType); - String filt = extractFilters(filters, - (PdfStream) xob); - if (filt != null) { - // If the filter is one which the NISO - // schema - // knows about, put it in the NISO - // metadata, - // otherwise put it in a Filter - // property. 
- int nisoFilt = nameToNiso(filt, - compressionStrings, - compressionValues); - if (nisoFilt >= 0) { - /* - * If it's 2, it's a CCITTFaxDecode - * filter. There may be an optional - * K entry that can change the - * value. - */ - PdfObject parms = xobdict.get( - DICT_KEY_DECODE_PARAMS); - if (parms != null) { - PdfSimpleObject kobj = null; - if (parms instanceof PdfDictionary) { - PdfDictionary pdict = (PdfDictionary) parms; - kobj = (PdfSimpleObject) resolveIndirectObject( - pdict.get(DICT_KEY_K)); - } - /* - * Note that the DecodeParms - * value may also be an array - * of dictionaries. We are not - * handling that contingency. - */ - if (kobj != null) { - int k = kobj.getIntValue(); - if (k < 0) { - nisoFilt = 4; - } else if (k > 0) { - nisoFilt = 3; - } - } - } - niso.setCompressionScheme(nisoFilt); - } else { - imgList.add(new Property( - PROP_NAME_FILTER, - PropertyType.STRING, filt)); - } - } else { - niso.setCompressionScheme(1); // no - // filter - } - - // Check for color space info - PdfObject colorSpc = xobdict - .get(DICT_KEY_COLOR_SPACE); - if (colorSpc != null) { - String colorName = null; - if (colorSpc instanceof PdfSimpleObject) { - colorName = ((PdfSimpleObject) colorSpc) - .getStringValue(); - } else if (colorSpc instanceof PdfArray) { - Vector vec = ((PdfArray) colorSpc) - .getContent(); - // Use the first element, which is - // the color space family - PdfSimpleObject fam = (PdfSimpleObject) vec - .elementAt(0); - colorName = fam.getStringValue(); - } - if (colorName != null) { - int nisoSpace = nameToNiso( - colorName, - colorSpaceStrings, - colorSpaceValues); - if (nisoSpace >= 0) { - niso.setColorSpace(nisoSpace); - } else { - imgList.add(new Property( - PROP_NAME_COLOR_SPACE, - PropertyType.STRING, - colorName)); - } - } - } - - PdfSimpleObject bpc = (PdfSimpleObject) xobdict - .get(DICT_KEY_BITS_PER_COMPONENT); - if (bpc != null) { - // imgList.add(new - // Property(DICT_KEY_BITS_PER_COMPONENT, - // PropertyType.INTEGER, - // new Integer 
(bpc.getIntValue()))); - niso.setBitsPerSample(new int[] { - bpc.getIntValue() }); - } - - PdfSimpleObject intent = (PdfSimpleObject) xobdict - .get(DICT_KEY_INTENT); - if (intent != null) { - imgList.add(new Property( - PROP_NAME_INTENT, - PropertyType.STRING, - intent.getStringValue())); - } - - PdfSimpleObject imgmsk = (PdfSimpleObject) xobdict - .get(DICT_KEY_IMAGE_MASK); - if (imgmsk != null) { - boolean b = imgmsk.isTrue(); - imgList.add(new Property( - PROP_NAME_IMAGE_MASK, - PropertyType.BOOLEAN, - Boolean.valueOf(b))); - } - - PdfArray dcd = (PdfArray) xobdict - .get(DICT_KEY_DECODE); - if (dcd != null) { - Vector dcdvec = dcd - .getContent(); - List dcdlst = new ArrayList( - dcdvec.size()); - Iterator diter = dcdvec - .iterator(); - while (diter.hasNext()) { - PdfSimpleObject d = (PdfSimpleObject) diter - .next(); - dcdlst.add(new Integer( - d.getIntValue())); - } - imgList.add(new Property( - PROP_NAME_DECODE, - PropertyType.INTEGER, - PropertyArity.LIST, dcdlst)); - } - - PdfSimpleObject intrp = (PdfSimpleObject) xobdict - .get(DICT_KEY_INTERPOLATE); - if (intrp != null) { - boolean b = intrp.isTrue(); - imgList.add(new Property( - PROP_NAME_INTERPOLATE, - PropertyType.BOOLEAN, - Boolean.valueOf(b))); - } - - PdfSimpleObject nam = (PdfSimpleObject) xobdict - .get(DICT_KEY_NAME); - if (nam != null) { - imgList.add(new Property(PROP_NAME_NAME, - PropertyType.STRING, - nam.getStringValue())); - } - - PdfSimpleObject id = (PdfSimpleObject) resolveIndirectObject( - xobdict.get(DICT_KEY_ID)); - if (id != null) { - String idstr = toHex( - id.getStringValue()); - imgList.add(new Property(PROP_NAME_ID, - PropertyType.STRING, idstr)); - } - - _imagesList.add(prop); - } - - } - } - } - } - } - } catch (PdfException e) { - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - } catch (Exception e) { - info.setWellFormed(false); - String mess = MessageFormat.format( - MessageConstants.PDF_HUL_103.getMessage(), - 
e.getClass().getName()); - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_103.getId(), mess); - info.setMessage(new ErrorMessage(message)); // PDF-HUL-103 - } - } - - /* - * Convert a Filter name to a NISO compression scheme value. - * If the name is unknown to NISO, return -1. - */ - protected int nameToNiso(String name, String[] nameArray, int[] valArray) { - for (int i = 0; i < nameArray.length; i++) { - if (nameArray[i].equals(name)) { - return valArray[i]; - } - } - return -1; // no match - } - - protected void findFonts(RepInfo info) throws IOException { - _type0FontsMap = new HashMap(); - _type1FontsMap = new HashMap(); - _trueTypeFontsMap = new HashMap(); - _mmFontsMap = new HashMap(); - _type3FontsMap = new HashMap(); - _cid0FontsMap = new HashMap(); - _cid2FontsMap = new HashMap(); - //needed if object streams are encrypted - if (_docTreeRoot == null) { - return; - } - try { - _docTreeRoot.startWalk(); - for (;;) { - // This time we need all the page objects and page tree - // nodes, because resources can be inherited from - // page tree nodes. - DocNode node = _docTreeRoot.nextDocNode(); - if (node == null) { - break; - } - // Get the fonts for the node - PdfDictionary fonts = null; - fonts = node.getFontResources(); - if (fonts != null) { - // In order to make sure we have a collection of - // unique fonts, we store them in a map keyed by - // object number. - Iterator fontIter = fonts.iterator(); - while (fontIter.hasNext()) { - PdfObject fontRef = fontIter.next(); - PdfObject font = resolveIndirectObject(fontRef); - if (font instanceof PdfDictionary) { - addFontToMap((PdfDictionary) font); - } else { - // Expected a dictionary - info.setWellFormed(false); - info.setMessage(new ErrorMessage( - MessageConstants.PDF_HUL_104, // PDF-HUL-104 - _parser.getOffset())); - return; - } - // If we've been directed appropriately, - // we accumulate the information, but don't - // report it. 
In that case, we post a message - // just once to that effect. - if (!_skippedFontsReported && !_showFonts - && _verbosity != Module.MAXIMUM_VERBOSITY) { - info.setMessage(new InfoMessage( - MessageConstants.PDF_HUL_105)); // PDF-HUL-105 - _skippedFontsReported = true; - } - } - } - } - } catch (PdfException e) { - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - return; - } catch (Exception e) { - // Unexpected exception. - _logger.log(Level.WARNING, - MessageConstants.PDF_HUL_106.getMessage(), e); - info.setWellFormed(false); - info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_106, // PDF-HUL-106 - e.toString(), _parser.getOffset())); - return; - } - } - - /** - * Add the font to the appropriate map, and return its subtype. - * If we've exceeded the maximum number of fonts, then ignore it. - */ - protected String addFontToMap(PdfDictionary font) { - if (++_nFonts > maxFonts) { - return null; - } - String subtypeStr = null; - try { - PdfSimpleObject subtype = (PdfSimpleObject) font - .get(DICT_KEY_FONT_SUBTYPE); - subtypeStr = subtype.getStringValue(); - if (FONT_TYPE0.equals(subtypeStr)) { - _type0FontsMap.put(new Integer(font.getObjNumber()), font); - // If the font is Type 0, we must go - // through its descendant fonts - PdfObject desc0 = font.get(DICT_KEY_DESCENDANT_FONTS); - PdfArray descendants = (PdfArray) resolveIndirectObject(desc0); - Vector subfonts = descendants.getContent(); - Iterator subfontIter = subfonts.iterator(); - while (subfontIter.hasNext()) { - PdfObject subfont = subfontIter.next(); - subfont = resolveIndirectObject(subfont); - addFontToMap((PdfDictionary) subfont); - } - } else if (FONT_TYPE1.equals(subtypeStr)) { - _type1FontsMap.put(new Integer(font.getObjNumber()), font); - } else if (FONT_MM_TYPE1.equals(subtypeStr)) { - _mmFontsMap.put(new Integer(font.getObjNumber()), font); - } else if (FONT_TYPE3.equals(subtypeStr)) { - _type3FontsMap.put(new Integer(font.getObjNumber()), 
font); - } else if (FONT_TRUE_TYPE.equals(subtypeStr)) { - _trueTypeFontsMap.put(new Integer(font.getObjNumber()), font); - } else if (FONT_CID_TYPE0.equals(subtypeStr)) { - _cid0FontsMap.put(new Integer(font.getObjNumber()), font); - } else if (FONT_CID_TYPE2.equals(subtypeStr)) { - _cid2FontsMap.put(new Integer(font.getObjNumber()), font); - } - return subtypeStr; - } catch (Exception e) { - return null; - } - } - - /****************************************************************** - * PRIVATE CLASS METHODS. - ******************************************************************/ - - protected static String toHex(String s) { - StringBuffer buffer = new StringBuffer("0x"); - - int len = s.length(); - for (int i = 0; i < len; i++) { - String h = Integer.toHexString(s.charAt(i)); - if (h.length() < 2) { - buffer.append("0"); - } - buffer.append(h); - } - - return buffer.toString(); - } - - protected static String toHex(Vector v) { - StringBuffer buffer = new StringBuffer("0x"); - - int len = v.size(); - for (int i = 0; i < len; i++) { - int hdigit = v.elementAt(i).intValue(); - String h = Integer.toHexString(hdigit); - if (h.length() < 2) { - buffer.append("0"); - } - buffer.append(h); - } - - return buffer.toString(); - } - - /** - * If the argument is an indirect object reference, - * returns the object it resolves to, otherwise returns - * the object itself. In particular, calling with null will - * return null. - */ - public PdfObject resolveIndirectObject(PdfObject obj) - throws PdfException, IOException { - if (obj instanceof PdfIndirectObj) { - int objIndex = ((PdfIndirectObj) obj).getObjNumber(); - /* - * Here we need to allow for the possibility that the - * object is compressed in an object stream. That means - * creating a new structure (call it _xref2) that contains - * the stream object number and offset whenever _xref[objIndex] - * is negative. 
_xref2 will have to contain the content - * stream object number (which will itself have to be - * resolved) and the offset into the object stream. - */ - return getObject(objIndex, MAX_OBJ_STREAM_DEPTH); - } - return obj; - } - - /** - * Returns an object of a given number. This may involve - * recursion into object streams, in which case it calls itself. - * - * @param objIndex - * The object number to look up - * @param recGuard - * The maximum permitted number of recursion levels; - * no particular value is required, but 30 or more - * should avoid false exceptions. - */ - protected PdfObject getObject(int objIndex, int recGuard) - throws PdfException, IOException { - /* Guard against infinite recursion */ - if (recGuard <= 0) { - throw new PdfMalformedException(MessageConstants.PDF_HUL_107); - } - long offset = _xref[objIndex]; - if (offset == 0) { - return null; // This is considered legitimate by the spec - } - if (offset < 0) { - return getObjectFromStream(objIndex, recGuard); - } - _parser.seek(offset); - PdfObject obj = _parser.readObjectDef(this); - // - // Experimental carl@openpreservation.org 2018-03-14 - // - // Previously all object numbers (ids) were overwritten even if they'd - // previously been assigned. - // - // This is caused by a little confusion where the object ID and the - // index of the _xref array are used interchangeably when they're not - // the same thing. There's an assumption when for the _xref array - // that the objects will have continuous numeric object numbers. This - // means that the object number and array position will always be the - // same. The setting of the object number meant that the wrong object - // could - // be returned with the id changed to match the id requested. - // - // My guess is that the assignment was put in to ensure that an - // object that escaped initialisation had an object number. 
If that's - // the case then the code below will still allow that to happen but - // will prevent assigned numbers from been overwritten by the xref array - // position. - if (obj.getObjNumber() == -1) { - obj.setObjNumber(objIndex); - } - return obj; - } - - /** - * Return the RandomAccessFile being read. - */ - public RandomAccessFile getFile() { - return _raf; - } - - /** - * Returns the catalog dictionary object. - */ - public PdfDictionary getCatalogDict() { - return _docCatDict; - } - - /** - * Returns the trailer dictionary object. - */ - public PdfDictionary getTrailerDict() { - return _trailerDict; - } - - /** - * Returns the viewer preferences dictionary object. - */ - public PdfDictionary getViewPrefDict() { - return _viewPrefDict; - } - - /** - * Returns the outlines dictionary object. - */ - public PdfDictionary getOutlineDict() { - return _outlineDict; - } - - /** - * Get a font map. The map returned is determined by the selector. - * Any other value returns null. - */ - public Map getFontMap(int selector) { - switch (selector) { - case F_TYPE0: - return _type0FontsMap; - case F_TYPE1: - return _type1FontsMap; - case F_TT: - return _mmFontsMap; - case F_TYPE3: - return _type3FontsMap; - case F_MM1: - return _mmFontsMap; - case F_CID0: - return _cid0FontsMap; - case F_CID2: - return _cid2FontsMap; - default: - return null; - } - } - - /** - * Return a List of all the font maps. Together, these contain - * all the fonts and subfonts in the document. Some of the maps - * may be null. - */ - public List> getFontMaps() { - List> lst = new ArrayList>( - 7); - lst.add(_type0FontsMap); - lst.add(_type1FontsMap); - lst.add(_mmFontsMap); - lst.add(_type3FontsMap); - lst.add(_trueTypeFontsMap); - lst.add(_cid0FontsMap); - lst.add(_cid2FontsMap); - return lst; - } - - /** - * Returns a NameTreeNode for the EmbeddedFiles entry of the - * Names dictionary. Returns null if there isn't one. 
- */ - public NameTreeNode getEmbeddedFiles() { - return _embeddedFiles; - } - - /** - * Add the various font lists as a fonts property. Note: only add - * the "Fonts" property if there are, in fact, fonts defined. - */ - protected void addFontsProperty(List metadataList) { - List fontTypesList = new LinkedList(); - Property fontp = null; - if (_type0FontsMap != null && !_type0FontsMap.isEmpty()) { - try { - fontp = buildFontProperty(PROP_NAME_FONT_TYPE0, _type0FontsMap, - F_TYPE0); - fontTypesList.add(fontp); - } catch (ClassCastException e) { - // Report an error here? - } - } - if (_type1FontsMap != null && !_type1FontsMap.isEmpty()) { - try { - fontp = buildFontProperty(PROP_NAME_FONT_TYPE1, _type1FontsMap, - F_TYPE1); - fontTypesList.add(fontp); - } catch (ClassCastException e) { - // Report an error here? - } - } - if (_trueTypeFontsMap != null && !_trueTypeFontsMap.isEmpty()) { - try { - fontp = buildFontProperty(PROP_NAME_FONT_TRUE_TYPE, - _trueTypeFontsMap, F_TT); - fontTypesList.add(fontp); - } catch (ClassCastException e) { - // Report an error here? 
- } - } - if (_type3FontsMap != null && !_type3FontsMap.isEmpty()) { - try { - fontp = buildFontProperty(PROP_NAME_FONT_TYPE3, _type3FontsMap, - F_TYPE3); - fontTypesList.add(fontp); - } catch (ClassCastException e) { - } - } - if (_mmFontsMap != null && !_mmFontsMap.isEmpty()) { - try { - fontp = buildFontProperty(PROP_NAME_FONT_MM_TYPE1, _mmFontsMap, - F_MM1); - fontTypesList.add(fontp); - } catch (ClassCastException e) { - } - } - if (_cid0FontsMap != null && !_cid0FontsMap.isEmpty()) { - try { - fontp = buildFontProperty(PROP_NAME_FONT_CID_TYPE0, - _cid0FontsMap, F_CID0); - fontTypesList.add(fontp); - } catch (ClassCastException e) { - } - } - if (_cid2FontsMap != null && !_cid2FontsMap.isEmpty()) { - try { - fontp = buildFontProperty(PROP_NAME_FONT_CID_TYPE2, - _cid2FontsMap, F_CID2); - fontTypesList.add(fontp); - } catch (ClassCastException e) { - } - } - if (fontTypesList.size() > 0) { - metadataList.add(new Property(PROP_NAME_FONTS, - PropertyType.PROPERTY, PropertyArity.LIST, fontTypesList)); - } - } - - /* Build Pages property, with associated subproperties. */ - protected void addPagesProperty(List metadataList, RepInfo info) { - _pagesList = new LinkedList(); - _pageSeqMap = new HashMap(500); - // needed if object streams are encrypted - if (_docTreeRoot == null) { - return; - } - try { - _docTreeRoot.startWalk(); - int pageIndex = 0; - // Start the pipe with two entries. - // We always need to have the current and the next - // entry from the page label tree in order to determine - // the lower and upper bounds of the applicable range. - // If the first entry has a bound greater than zero, - // that appears to be an undefined situation, so we - // always treat the first entry as starting at zero. 
- if (_pageLabelRoot != null) { - if (!_pageLabelRoot.findNextKeyValue()) { - throw new PdfMalformedException( - MessageConstants.PDF_HUL_111); // PDF-HUL-111 - } - - _pageLabelRoot.findNextKeyValue(); - } - for (;;) { - // Get all the page objects in the document sequentially - // Have to do this in two passes so that link - // destinations can be properly reported. - PageObject page = _docTreeRoot.nextPageObject(); - if (page == null) { - break; - } - _pageSeqMap.put(new Integer(page.getDict().getObjNumber()), - new Integer(pageIndex + 1)); - } - _docTreeRoot.startWalk(); - for (;;) { - PageObject page = _docTreeRoot.nextPageObject(); - if (page == null) { - break; - } - Property p = buildPageProperty(page, pageIndex++, info); - _pagesList.add(p); - } - if (_showPages || _verbosity == Module.MAXIMUM_VERBOSITY) { - Property prop = new Property(PROP_NAME_PAGES, - PropertyType.PROPERTY, PropertyArity.LIST, _pagesList); - metadataList.add(prop); - } else { - if (!_skippedPagesReported) { - info.setMessage( - new InfoMessage(MessageConstants.PDF_HUL_112)); // PDF-HUL-112 - _skippedPagesReported = true; - } - } - } catch (PdfException e) { - - e.disparage(info); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - return; - } - } - - /* Build a subproperty for one PageObject. */ - protected Property buildPageProperty(PageObject page, int idx, RepInfo info) - throws PdfException { - List pagePropList = new ArrayList(4); - try { - // Foo on Java's inability to return values through - // parameters. Passing an array is a crock to achieve - // that effect. - int[] nominalNum = new int[1]; - Property plProp = buildPageLabelProperty(page, idx, nominalNum); - if (plProp != null) { - pagePropList.add(plProp); - } - if (plProp == null || nominalNum[0] != idx + 1) { - // Page sequence is different from label, or - // there is no label. Make it 1-based. 
- pagePropList.add(new Property(PROP_NAME_SEQUENCE, - PropertyType.INTEGER, new Integer(idx + 1))); - - } - } catch (PdfException e) { - throw e; - } catch (Exception f) { - throw new PdfMalformedException(MessageConstants.PDF_HUL_113); // PDF-HUL-113 - } - - try { - List annotsList = new LinkedList(); - PdfArray annots = page.getAnnotations(); - if (annots != null) { - Vector contents = annots.getContent(); - for (int i = 0; i < contents.size(); i++) { - PdfObject annot = resolveIndirectObject( - contents.elementAt(i)); - if (annot instanceof PdfDictionary) { - annotsList.add(buildAnnotProperty((PdfDictionary) annot, - info)); - } else if (annot instanceof PdfSimpleObject - && ((PdfSimpleObject) annot).getToken() instanceof Comment) { - // ignore Comments - continue; - - } else { - // There are annotations which aren't dictionaries. I've - // run into this, - // but it violates the spec as far as I can tell. - throw new PdfInvalidException( - MessageConstants.PDF_HUL_114); // PDF-HUL-114 - } - } - if (!annotsList.isEmpty()) { - if (_showAnnotations - || _verbosity == Module.MAXIMUM_VERBOSITY) { - Property annotProp = new Property(PROP_NAME_ANNOTATIONS, - PropertyType.PROPERTY, PropertyArity.LIST, - annotsList); - pagePropList.add(annotProp); - } else { - // We don't report annotations if we got here, - // but we do report that we don't report them. 
- if (!_skippedAnnotationsReported) { - info.setMessage(new InfoMessage( - MessageConstants.PDF_HUL_115)); // PDF-HUL-115 - _skippedAnnotationsReported = true; - } - } - } - } - } catch (PdfException e) { - throw e; - } catch (Exception f) { - throw new PdfMalformedException(MessageConstants.PDF_HUL_116); // PDF-HUL-116 - } - - try { - // Rotation property is inheritable - PdfObject tempObj = page.get(DICT_KEY_ROTATE, - true); - PdfSimpleObject rot = null; - if (tempObj != null && tempObj instanceof PdfSimpleObject) { - rot = (PdfSimpleObject) tempObj; - } else if (tempObj != null && tempObj instanceof PdfIndirectObj) { - rot = (PdfSimpleObject) ((PdfIndirectObj) tempObj) - .getObject(); - } - if (rot != null && rot.getIntValue() != 0) { - pagePropList.add(new Property(PROP_NAME_ROTATE, - PropertyType.INTEGER, new Integer(rot.getIntValue()))); - } - - // UserUnit property (1.6), not inheritable - PdfSimpleObject uu = (PdfSimpleObject) page.get(DICT_KEY_USER_UNIT, - false); - if (uu != null) { - pagePropList.add(new Property(PROP_NAME_USER_UNIT, - PropertyType.DOUBLE, new Double(rot.getDoubleValue()))); - } - // Viewport dictionaries (1.6), not inheritable - PdfArray vp = (PdfArray) page.get(DICT_KEY_VIEWPORT, false); - if (vp != null) { - Vector vpv = vp.getContent(); - Iterator iter = vpv.iterator(); - List vplist = new ArrayList(vpv.size()); - while (iter.hasNext()) { - PdfDictionary vpd = (PdfDictionary) resolveIndirectObject( - iter.next()); - PdfObject vpdbb = vpd.get(DICT_KEY_BBOX); - List vpPropList = new ArrayList(); - vpPropList.add(makeRectProperty( - (PdfArray) resolveIndirectObject(vpdbb), - DICT_KEY_BBOX)); - PdfObject meas = vpd.get(DICT_KEY_MEASURE); - if (meas instanceof PdfDictionary) { - vpPropList.add( - buildMeasureProperty((PdfDictionary) meas)); - // No, that's wrong -- the Viewport property itself - // needs to be a list with a bounding box. 
- } - vplist.add(new Property(PROP_NAME_VIEWPORT, - PropertyType.PROPERTY, PropertyArity.LIST, - vpPropList)); - } - pagePropList.add(new Property(PROP_NAME_VIEWPORTS, - PropertyType.PROPERTY, PropertyArity.LIST, vplist)); - } - // Thumbnail -- we just report if it's there. It's a - // non-inheritable property - PdfObject thumb = page.get(DICT_KEY_THUMB, false); - if (thumb != null) { - pagePropList.add(new Property(PROP_NAME_THUMB, - PropertyType.BOOLEAN, Boolean.TRUE)); - } - return new Property(PROP_NAME_PAGE, PropertyType.PROPERTY, - PropertyArity.LIST, pagePropList); - } catch (PdfException e) { - throw e; - } catch (Exception f) { - throw new PdfMalformedException(MessageConstants.PDF_HUL_117); // PDF-HUL-117 - } - } - - /* - * Build a subproperty of a subproperty for page labels. - * The nomNumRef argument is a crock for returning the - * nominal number; element 0 of the array is replaced - * by the nominal number of the page. - */ - protected Property buildPageLabelProperty(PageObject page, int pageIndex, - int[] nomNumRef) throws PdfException { - if (_pageLabelRoot == null) { - return null; // no page label info - } - - // Note that our "current" page is the page label tree's - // "previous" key. Sorry about that... - int curFirstPage = _pageLabelRoot.getPrevKey(); - int nextFirstPage = _pageLabelRoot.getCurrentKey(); - try { - // If we're onto the next page range, advance our pointers. 
- if (pageIndex >= nextFirstPage) { - _pageLabelRoot.findNextKeyValue(); - curFirstPage = nextFirstPage; - } - PdfDictionary pageLabelDict = (PdfDictionary) resolveIndirectObject( - _pageLabelRoot.getPrevValue()); - StringBuffer labelText = new StringBuffer(); - PdfSimpleObject prefixObj = (PdfSimpleObject) pageLabelDict - .get(DICT_KEY_P); - if (prefixObj != null) { - labelText.append(prefixObj.getStringValue()); - } - PdfSimpleObject firstPageObj = (PdfSimpleObject) pageLabelDict - .get("St"); - // Sequence start value defaults to 1 if there's no start value - int firstPageVal = ((firstPageObj != null) - ? firstPageObj.getIntValue() - : 1); - int nominalPage = pageIndex - curFirstPage + firstPageVal; - if (nominalPage <= 0) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_118); // pDF-HUL-118 - } - nomNumRef[0] = nominalPage; - - // Get the numbering style. If there is no numbering - // style entry, the label consists only of the prefix. - PdfSimpleObject numStyleObj = (PdfSimpleObject) pageLabelDict - .get("S"); - String numStyle; - if (numStyleObj == null) { - numStyle = null; - } else { - numStyle = numStyleObj.getStringValue(); - } - if ("D".equals(numStyle)) { - // Nice, simple decimal numbers - labelText.append(nominalPage); - } else if ("R".equals(numStyle)) { - // Upper case roman numerals - labelText.append(PageLabelNode.intToRoman(nominalPage, true)); - } else if ("r".equals(numStyle)) { - // Lower case roman numerals - labelText.append(PageLabelNode.intToRoman(nominalPage, false)); - } else if ("A".equals(numStyle)) { - // Uppercase letters (A-Z, AA-ZZ, ...) - labelText.append(PageLabelNode.intToBase26(nominalPage, true)); - } else if ("a".equals(numStyle)) { - // Lowercase letters (a-z, aa-zz, ...) - labelText.append(PageLabelNode.intToBase26(nominalPage, false)); - } - // It screws up the PDF output if we have a blank Label property. 
- if (labelText.length() == 0) { - labelText.append(EMPTY_LABEL_PROPERTY); - } - return new Property(PROP_NAME_LABEL, PropertyType.STRING, - labelText.toString()); - } catch (Exception e) { - throw new PdfMalformedException(MessageConstants.PDF_HUL_119); // PDF-HUL-119 - } - } - - /* Build a subproperty for a measure dictionary. */ - protected Property buildMeasureProperty(PdfDictionary meas) { - List plist = new ArrayList(); - PdfObject itemObj = meas.get(DICT_KEY_XOBJ_SUBTYPE); - if (itemObj instanceof PdfSimpleObject) { - plist.add(new Property(PROP_NAME_SUBTYPE, PropertyType.STRING, - ((PdfSimpleObject) itemObj).getStringValue())); - } - itemObj = meas.get(DICT_KEY_R); - if (itemObj instanceof PdfSimpleObject) { - plist.add(new Property(PROP_NAME_RATIO, PropertyType.STRING, - ((PdfSimpleObject) itemObj).getStringValue())); - } - // All kinds of stuff I could add -- limit it to the required - // X, Y, D and A arrays. - itemObj = meas.get("X"); - if (itemObj instanceof PdfArray) { - plist.add(buildNumberFormatArrayProperty((PdfArray) itemObj, "X")); - } - itemObj = meas.get("Y"); - if (itemObj instanceof PdfArray) { - plist.add(buildNumberFormatArrayProperty((PdfArray) itemObj, "Y")); - } - itemObj = meas.get("D"); - if (itemObj instanceof PdfArray) { - plist.add(buildNumberFormatArrayProperty((PdfArray) itemObj, PROP_NAME_DISTANCE)); - } - itemObj = meas.get("A"); - if (itemObj instanceof PdfArray) { - plist.add(buildNumberFormatArrayProperty((PdfArray) itemObj, PROP_NAME_AREA)); - } - return new Property(PROP_NAME_MEASURE, PropertyType.PROPERTY, - PropertyArity.LIST, plist); - } - - /* Build a subproperty for a number format array. 
*/ - private Property buildNumberFormatArrayProperty(PdfArray arr, String propertyName) { - Vector v = arr.getContent(); - List alist = new ArrayList<>(); - for (int i = 0; i < v.size(); i++) { - PdfObject xobj = v.elementAt(i); - if (xobj instanceof PdfDictionary) { - PdfObject obj = ((PdfDictionary) xobj).get("U"); - if (obj instanceof PdfSimpleObject) { - alist.add(new Property("Name", PropertyType.DOUBLE, ((PdfSimpleObject) obj).getDoubleValue())); - } - obj = ((PdfDictionary) xobj).get("C"); - if (obj instanceof PdfSimpleObject) { - alist.add( - new Property("Coefficient", PropertyType.STRING, ((PdfSimpleObject) obj).getStringValue())); - } - } - } - return new Property(propertyName, PropertyType.PROPERTY, PropertyArity.LIST, alist); - } - - /* Build a subproperty of a subproperty for an annotation. */ - protected Property buildAnnotProperty(PdfDictionary annot, RepInfo info) - throws PdfException { - List propList = new ArrayList(7); - PdfObject itemObj; - try { - // Subtype is required - itemObj = annot.get(DICT_KEY_XOBJ_SUBTYPE); - propList.add(new Property(PROP_NAME_SUBTYPE, PropertyType.STRING, - ((PdfSimpleObject) itemObj).getStringValue())); - - // Contents is optional for some subtypes, required for - // others. We consider it optional here. - itemObj = annot.get(DICT_KEY_CONTENTS); - if (itemObj != null) { - propList.add( - new Property(PROP_NAME_CONTENTS, PropertyType.STRING, - _encrypted ? ENCRYPTED - : ((PdfSimpleObject) itemObj) - .getStringValue())); - } - - // Rectangle is required, and must be in the rectangle format - itemObj = annot.get(DICT_KEY_RECT); - propList.add(makeRectProperty( - (PdfArray) resolveIndirectObject(itemObj), PROP_NAME_RECT)); - - // Name comes from the NM entry and is optional - itemObj = annot.get("NM"); - if (itemObj != null) { - propList.add(new Property(DICT_KEY_NAME, PropertyType.STRING, - _encrypted ? ENCRYPTED - : ((PdfSimpleObject) itemObj).getStringValue())); - } - - // LastModified is optional. 
The documentation says that - // a PDF date is preferred but not guaranteed. We just - // put it out as a string. - itemObj = annot.get("M"); - if (itemObj != null) { - Literal lastModLit = (Literal) ((PdfSimpleObject) itemObj) - .getToken(); - Property dateProp; - dateProp = new Property(PROP_NAME_LAST_MOD, PropertyType.STRING, - _encrypted ? ENCRYPTED - : lastModLit.getValue()); - - propList.add(dateProp); - } - - // Flags. - itemObj = annot.get("F"); - if (itemObj != null) { - int flagValue = ((PdfSimpleObject) itemObj).getIntValue(); - Property flagProp = (buildBitmaskProperty(flagValue, - PROP_NAME_FLAGS, PdfStrings.ANNOTATIONFLAGS, - PROP_VAL_NO_FLAGS_SET)); - if (flagProp != null) { - propList.add(flagProp); - } - } - - // Appearance dictionary -- just check if it's there. - itemObj = annot.get("AP"); - if (itemObj != null) { - propList.add(new Property(PROP_NAME_APP_DICT, - PropertyType.BOOLEAN, Boolean.TRUE)); - } - - // Action dictionary -- if it's there, set actionsExist - itemObj = annot.get("A"); - if (itemObj != null) { - _actionsExist = true; - itemObj = resolveIndirectObject(itemObj); - // Actions are as common as Destinations for - // connecting to destination pages. If the Action - // is of type GoTo, note its destination. - PdfSimpleObject actionSubtype = (PdfSimpleObject) ((PdfDictionary) itemObj) - .get("S"); - if (actionSubtype == null) { - throw new PdfMalformedException( - MessageConstants.PDF_HUL_120); // PDF-HUL-120 - } - if (ACTION_VAL_GOTO.equals(actionSubtype.getStringValue())) { - PdfObject destObj = ((PdfDictionary) itemObj).get("D"); - if (destObj != null) { - addDestination(destObj, PROP_NAME_ACTION_DEST, propList, - info); - } - } - } - - // Destination object. 
- itemObj = annot.get(DICT_KEY_DEST); - if (itemObj != null) { - addDestination(itemObj, PROP_NAME_DESTINATION, propList, info); - } - - // Reply Type (RT) (1.6) - itemObj = annot.get("RT"); - if (itemObj instanceof PdfSimpleObject) { - String type = ((PdfSimpleObject) itemObj).getStringValue(); - propList.add(new Property(PROP_NAME_REPLY_TYPE, - PropertyType.STRING, type)); - } - - // Intent (IT) (1.6) - itemObj = annot.get("IT"); - if (itemObj instanceof PdfSimpleObject) { - String type = ((PdfSimpleObject) itemObj).getStringValue(); - propList.add(new Property(PROP_NAME_INTENT, PropertyType.STRING, - type)); - } - - // Callout Line (CL) (1.6) - itemObj = annot.get("CL"); - if (itemObj instanceof PdfArray) { - Vector clData = ((PdfArray) itemObj).getContent(); - // This should be an array of numbers. - Iterator iter = clData.iterator(); - List clList = new ArrayList(6); - while (iter.hasNext()) { - PdfSimpleObject clItem = (PdfSimpleObject) iter.next(); - clList.add(new Double(clItem.getDoubleValue())); - } - propList.add(new Property(PROP_NAME_CALLOUT_LINE, - PropertyType.DOUBLE, PropertyArity.LIST, clList)); - } - - return new Property(PROP_NAME_ANNOTATION, PropertyType.PROPERTY, - PropertyArity.LIST, propList); - } catch (PdfException ee) { - // Just rethrow these - throw ee; - } catch (Exception e) { - throw new PdfMalformedException(MessageConstants.PDF_HUL_121); // PDF-HUL-121 - } - } - - /* - * Given a PdfObject that stands for a Destination, add - * a representative property to the property list. - */ - protected void addDestination(PdfObject itemObj, String propName, - List propList, RepInfo info) { - try { - Destination dest = new Destination(itemObj, this, false); - if (dest.isIndirect()) { - // Encryption messes up name trees - if (!_encrypted) { - int pageObjNum = resolveIndirectDest( - dest.getIndirectDest(), info); - if (pageObjNum == -1) { - // The scope of the reference is outside this - // file, so we just report it as such. 
- propList.add(new Property(propName, PropertyType.STRING, - PROP_VAL_EXTERNAL)); - } else { - propList.add(new Property(propName, - PropertyType.INTEGER, new Integer(pageObjNum))); - } - } - } else { - if (dest.getPageDest() == null) { - return; // can't get the page object number - } - int pageObjNum = dest.getPageDestObjNumber(); - Integer destPg = _pageSeqMap.get(new Integer(pageObjNum)); - if (destPg != null) { - propList.add(new Property(propName, PropertyType.INTEGER, - destPg)); - } - } - } catch (PdfMalformedException e) { - propList.add(new Property(propName, PropertyType.STRING, PROP_VAL_NULL)); - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - info.setValid(false); - } catch (PdfInvalidException e) { - if (e.getJhoveMessage() != null) { - info.setMessage(new ErrorMessage( - JhoveMessages.getMessageInstance( - e.getJhoveMessage().getId(), e.getJhoveMessage().getMessage(),e.getJhoveMessage().getSubMessage() ))); - } - } catch (Exception e) { - - String msg = e.getClass().getName(); - String msg1 = e.getMessage(); - if (msg1 != null) { - msg = msg + ": " + msg1; - } - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_122.getId(), msg); - propList.add( - new Property(propName, PropertyType.STRING, PROP_VAL_NULL)); - info.setMessage(new ErrorMessage(message, // PDF-HUL-122 - _parser.getOffset())); - info.setValid(false); - } - } - - /* - * Build up a property for one of the kinds of fonts - * in the file. - */ - protected Property buildFontProperty(String name, Map map, int fontType) { - List fontList = new LinkedList(); // list of fonts - Iterator fontIter = map.values().iterator(); - while (fontIter.hasNext()) { - // For each font in the map, build a property for it, - // which consists of a list of scalar properties. Each kind - // of font is spec'ed to have a slightly different set of - // properties, grumble... 
- PdfDictionary dict = (PdfDictionary) fontIter.next(); - List fontPropList = oneFontPropList(dict, fontType); - Property fProp = new Property(PROP_NAME_FONT, PropertyType.PROPERTY, - PropertyArity.LIST, fontPropList); - fontList.add(fProp); - } - return new Property(name, PropertyType.PROPERTY, PropertyArity.LIST, - fontList); - } - - /* Build the Property list for a given font */ - protected List oneFontPropList(PdfDictionary dict, int fontType) { - List fontPropList = new LinkedList(); - Property prop; - if (fontType == F_TYPE1 || fontType == F_TYPE3 || fontType == F_MM1 - || fontType == F_TT) { - PdfObject tempObj = dict.get(DICT_KEY_NAME); - PdfSimpleObject nameObj = null; - if (tempObj instanceof PdfSimpleObject) { - nameObj = (PdfSimpleObject) tempObj; - } else if (tempObj instanceof PdfIndirectObj) { - nameObj = (PdfSimpleObject) ((PdfIndirectObj) tempObj) - .getObject(); - } - - if (nameObj != null) { - String nameStr = nameObj.getStringValue(); - prop = new Property(DICT_KEY_NAME, PropertyType.STRING, - nameStr); - fontPropList.add(prop); - } - } - - String baseStr = null; - if (fontType != F_TYPE3) { - PdfObject tempObj = dict.get(DICT_KEY_BASE_FONT); - PdfSimpleObject baseFontObj = null; - if (tempObj instanceof PdfSimpleObject) { - baseFontObj = (PdfSimpleObject) tempObj; - } else if (tempObj instanceof PdfIndirectObj) { - baseFontObj = (PdfSimpleObject) ((PdfIndirectObj) tempObj) - .getObject(); - } - - if (baseFontObj != null) { - baseStr = baseFontObj.getStringValue(); - prop = new Property(PROP_NAME_BASE_FONT, PropertyType.STRING, - baseStr); - fontPropList.add(prop); - } - } - - if (fontType == F_CID0 || fontType == F_CID2) { - PdfObject elCid = dict.get(DICT_KEY_CID_INFO); - try { - elCid = resolveIndirectObject(elCid); - } catch (Exception e) { - } - if (elCid instanceof PdfDictionary) { - prop = buildCIDInfoProperty((PdfDictionary) elCid); - fontPropList.add(prop); - } - } - - if (fontType == F_TYPE1 || fontType == F_TT || fontType == F_MM1) { 
- if (isFontSubset(baseStr)) { - prop = new Property(PROP_NAME_FONT_SUBSET, PropertyType.BOOLEAN, - Boolean.TRUE); - fontPropList.add(prop); - } - } - - if (fontType == F_TYPE1 || fontType == F_TT || fontType == F_MM1 - || fontType == F_TYPE3) { - PdfObject firstCharObj = dict.get(DICT_KEY_FIRST_CHAR); - if (firstCharObj instanceof PdfIndirectObj) { - firstCharObj = ((PdfIndirectObj) firstCharObj).getObject(); - } - try { - int firstChar = ((PdfSimpleObject) firstCharObj).getIntValue(); - prop = new Property(PROP_NAME_FIRST_CHAR, PropertyType.INTEGER, - new Integer(firstChar)); - fontPropList.add(prop); - } catch (Exception e) { - } - - PdfObject lastCharObj = dict.get(DICT_KEY_LAST_CHAR); - if (lastCharObj instanceof PdfIndirectObj) { - lastCharObj = ((PdfIndirectObj) lastCharObj).getObject(); - } - try { - int lastChar = ((PdfSimpleObject) lastCharObj).getIntValue(); - prop = new Property(PROP_NAME_LAST_CHAR, PropertyType.INTEGER, - new Integer(lastChar)); - fontPropList.add(prop); - } catch (Exception e) { - } - } - - if (fontType == F_TYPE3) { - // Put FontBBox and CharProcs into properties - PdfObject bboxObj = dict.get(DICT_KEY_FONT_BBOX); - try { - if (bboxObj instanceof PdfArray) { - fontPropList.add(makeRectProperty((PdfArray) bboxObj, - PROP_VAL_FONT_BBOX)); - } - } catch (Exception e) { - } - - // For CharProcs, we're just checking if it's there. - // (It's required for a Type 3 font.) 
- // PdfObject charProcs = dict.get("CharProcs"); - // prop = new Property("CharProcs", - // PropertyType.BOOLEAN, - // Boolean.valueOf(charProcs != null)); - // fontPropList.add(prop); - } - - if (fontType == F_TYPE1 || fontType == F_TT || fontType == F_MM1 - || fontType == F_CID0 || fontType == F_CID2) { - PdfObject descriptorObj = dict.get(DICT_KEY_FONT_DESCRIPTOR); - try { - descriptorObj = resolveIndirectObject(descriptorObj); - } catch (Exception e) { - } - if (descriptorObj instanceof PdfDictionary) { - prop = buildFontDescriptorProperty( - (PdfDictionary) descriptorObj); - fontPropList.add(prop); - } - } - - PdfObject encodingObj = dict.get(DICT_KEY_ENCODING); - try { - encodingObj = resolveIndirectObject(encodingObj); - } catch (Exception e) { - } - - if (fontType == F_TYPE0 || fontType == F_TYPE1 || fontType == F_TT - || fontType == F_MM1 || fontType == F_TYPE3) { - // Encoding property -- but only if Encoding is a name - if (encodingObj instanceof PdfSimpleObject) { - prop = new Property(PROP_NAME_ENCODING, PropertyType.STRING, - ((PdfSimpleObject) encodingObj).getStringValue()); - fontPropList.add(prop); - } - } - - if (fontType == F_TYPE1 || fontType == F_TT || fontType == F_MM1 - || fontType == F_TYPE3) { - if (encodingObj != null && encodingObj instanceof PdfDictionary) { - prop = buildEncodingDictProperty((PdfDictionary) encodingObj); - fontPropList.add(prop); - } - } - - if (fontType == F_TYPE0) { - // Encoding is reported as a CMapDictionary property for type 0 - if (encodingObj != null && encodingObj instanceof PdfStream) { - prop = buildCMapDictProperty((PdfStream) encodingObj); - fontPropList.add(prop); - } - } - - if (fontType == F_TYPE3) { - // All we're interested in for Resources is whether - // the dictionary exists - PdfObject rsrc = dict.get(DICT_KEY_RESOURCES); - if (rsrc != null) { - prop = new Property(PROP_NAME_RESOURCES, PropertyType.BOOLEAN, - Boolean.TRUE); - fontPropList.add(prop); - } - } - - if (fontType == F_TYPE0 || fontType 
== F_TYPE1 || fontType == F_TT - || fontType == F_MM1 || fontType == F_TYPE3) { - PdfObject toUniObj = dict.get(DICT_KEY_TO_UNICODE); - if (toUniObj != null) { - prop = new Property(PROP_NAME_TO_UNICODE, PropertyType.BOOLEAN, - Boolean.TRUE); - fontPropList.add(prop); - } - } - - return fontPropList; - } - - /* - * Code for CMapProperty for Type 0 fonts, based on the Encoding - * entry, broken out of buildFontProperty. - */ - protected Property buildCMapDictProperty(PdfStream encoding) { - PdfDictionary dict = encoding.getDict(); - List propList = new ArrayList(4); - Property prop = new Property(PROP_NAME_CMAP_DICT, PropertyType.PROPERTY, - PropertyArity.LIST, propList); - Property subprop; - - // PdfObject mapName = dict.get ("CMapName"); - - PdfObject cidSysInfo = dict.get(DICT_KEY_CID_INFO); - // We can use buildCIDInfoProperty here to build the subproperty - PdfDictionary cidDict; - List cidList = new LinkedList(); - try { - if (cidSysInfo instanceof PdfDictionary) { - // One CIDInfo dictionary - cidDict = (PdfDictionary) cidSysInfo; - subprop = buildCIDInfoProperty(cidDict); - cidList.add(subprop); - } else if (cidSysInfo instanceof PdfArray) { - // Many CIDInfo dictionaries - Vector v = ((PdfArray) cidSysInfo).getContent(); - for (int i = 0; i < v.size(); i++) { - cidDict = (PdfDictionary) v.elementAt(i); - Property subsubprop = buildCIDInfoProperty(cidDict); - cidList.add(subsubprop); - } - } - } catch (Exception e) { - } - - if (!cidList.isEmpty()) { - subprop = new Property(PROP_NAME_CID_INFOS, PropertyType.PROPERTY, - PropertyArity.LIST, cidList); - propList.add(subprop); - } - - // PdfObject wMod = dict.get("WMode"); - // PdfObject useCMap = dict.get("UseCMap"); - - return prop; - } - - /* - * Code for CIDInfoProperty for CIDFontType0 and CIDFontType2 - * conts. 
- */ - protected Property buildCIDInfoProperty(PdfDictionary dict) { - List propList = new ArrayList(3); - Property prop = new Property(PROP_NAME_CID_INFO, PropertyType.PROPERTY, - PropertyArity.LIST, propList); - Property subprop; - - // Add the registry identifier - PdfObject reg = dict.get(DICT_KEY_REGISTRY); - if (reg instanceof PdfSimpleObject) { - try { - String regText = ((PdfSimpleObject) reg).getStringValue(); - subprop = new Property(PROP_NAME_REGISTRY, PropertyType.STRING, - _encrypted ? ENCRYPTED : regText); - propList.add(subprop); - } catch (Exception e) { - } - } - - // Add the name of the char collection within the registry - PdfObject order = dict.get(DICT_KEY_ORDERING); - if (reg instanceof PdfSimpleObject) { - try { - String ordText = ((PdfSimpleObject) order).getStringValue(); - subprop = new Property(PROP_NAME_REGISTRY, PropertyType.STRING, - ordText); - propList.add(subprop); - } catch (Exception e) { - } - } - - PdfObject supp = dict.get(DICT_KEY_SUPPLEMENT); - if (supp instanceof PdfSimpleObject) { - try { - int suppvalue = ((PdfSimpleObject) supp).getIntValue(); - subprop = new Property(PROP_NAME_SUPPLEMENT, - PropertyType.INTEGER, new Integer(suppvalue)); - propList.add(subprop); - } catch (Exception e) { - } - } - return prop; - } - - /* - * Code for EncodingDictionary Property for type 1, 3, TrueType, and - * MM fonts. This is based on a dictionary entry with the same name - * as the one for buildCMapDictProperty, but different information. - * Included properties are BaseEncoding and Differences. 
- */ - protected Property buildEncodingDictProperty(PdfDictionary encodingDict) { - List propList = new ArrayList(2); - Property prop = new Property(PROP_NAME_ENCODING_DICTIONARY, - PropertyType.PROPERTY, PropertyArity.LIST, propList); - PdfObject baseEnc = encodingDict.get(DICT_KEY_BASE_ENCODING); - if (baseEnc instanceof PdfSimpleObject) { - String baseEncString = ((PdfSimpleObject) baseEnc).getStringValue(); - if (baseEncString != null) { - Property baseEncProp = new Property(PROP_NAME_BASE_ENCODING, - PropertyType.STRING, baseEncString); - propList.add(baseEncProp); - } - } - - PdfObject diffs = encodingDict.get(DICT_KEY_DIFFERENCES); - Property diffsProp = new Property(PROP_NAME_DIFFERENCES, - PropertyType.BOOLEAN, Boolean.valueOf(diffs != null)); - propList.add(diffsProp); - - return prop; - } - - /* - * Separated-out code for FontDescriptor property. This - * is a list of six Properies: FontName, Flags, - * FontBBox, FontFile, FontFile2, and FontFile3. - */ - protected Property buildFontDescriptorProperty(PdfDictionary encodingDict) { - List propList = new ArrayList(6); - Property prop = new Property(PROP_NAME_FONT_DESC, PropertyType.PROPERTY, - PropertyArity.LIST, propList); - Property subprop; - try { - PdfSimpleObject fName = (PdfSimpleObject) encodingDict - .get(DICT_KEY_FONT_NAME); - String fNameStr = fName.getStringValue(); - subprop = new Property(PROP_NAME_FONT_NAME, PropertyType.STRING, - fNameStr); - propList.add(subprop); - } catch (Exception e) { - } - - try { - PdfSimpleObject flags = (PdfSimpleObject) encodingDict - .get(DICT_KEY_FLAGS); - int flagValue = flags.getIntValue(); - subprop = buildBitmaskProperty(flagValue, PROP_NAME_FLAGS, - PdfStrings.FONTDESCFLAGS, PROP_VAL_NO_FLAGS_SET); - if (subprop != null) { - propList.add(subprop); - } - } catch (Exception e) { - } - - try { - PdfArray bboxObj = (PdfArray) encodingDict.get(DICT_KEY_FONT_BBOX); - double[] bbox = bboxObj.toRectangle(); - // toRectangle is written to return an array of double, 
- // which is what the bounding box is in the most general - // case; but the spec requires an array of integer, so - // we convert is. This may seem like an excess of work, - // but I'd rather have toRectangle do the right thing - // rather than losing generality. - if (bbox != null) { - int[] ibbox = new int[4]; - for (int i = 0; i < 4; i++) { - ibbox[i] = (int) bbox[i]; - } - subprop = new Property(PROP_NAME_FONT_BBOX, - PropertyType.INTEGER, PropertyArity.ARRAY, ibbox); - propList.add(subprop); - } - } catch (Exception e) { - } - - PdfObject fontFile = encodingDict.get(DICT_KEY_FONT_FILE); - if (fontFile != null) { - // All we care about is whether it exists or not - subprop = new Property(PROP_NAME_FONT_FILE, PropertyType.BOOLEAN, - Boolean.TRUE); - propList.add(subprop); - } - fontFile = encodingDict.get(DICT_KEY_FONT_FILE_2); - if (fontFile != null) { - subprop = new Property(PROP_NAME_FONT_FILE_2, PropertyType.BOOLEAN, - Boolean.TRUE); - propList.add(subprop); - } - fontFile = encodingDict.get(DICT_KEY_FONT_FILE_3); - if (fontFile != null) { - subprop = new Property(PROP_NAME_FONT_FILE_3, PropertyType.BOOLEAN, - Boolean.TRUE); - propList.add(subprop); - } - return prop; - } - - protected Property buildViewPrefProperty(PdfDictionary prefDict) { - Property p; - PdfObject ob; - boolean b; - String s; - List propList = new ArrayList(12); - Property prop = new Property(DICT_KEY_VIEWER_PREFS, - PropertyType.PROPERTY, PropertyArity.LIST, propList); - - ob = prefDict.get(DICT_KEY_HIDE_TOOLBAR); - if (ob instanceof PdfSimpleObject) { - b = ((PdfSimpleObject) ob).isTrue(); - } else { - b = false; - } - p = new Property(PROP_NAME_HIDE_TOOLBAR, PropertyType.BOOLEAN, - Boolean.valueOf(b)); - propList.add(p); - - ob = prefDict.get(DICT_KEY_HIDE_MENUBAR); - if (ob instanceof PdfSimpleObject) { - b = ((PdfSimpleObject) ob).isTrue(); - } else { - b = false; - } - p = new Property(PROP_NAME_HIDE_MENUBAR, PropertyType.BOOLEAN, - Boolean.valueOf(b)); - propList.add(p); - - ob 
= prefDict.get(DICT_KEY_HIDE_WINDOW_UI); - if (ob instanceof PdfSimpleObject) { - b = ((PdfSimpleObject) ob).isTrue(); - } else { - b = false; - } - p = new Property(PROP_NAME_HIDE_WINDOW_UI, PropertyType.BOOLEAN, - Boolean.valueOf(b)); - propList.add(p); - - ob = prefDict.get(DICT_KEY_FIT_WINDOW); - if (ob instanceof PdfSimpleObject) { - b = ((PdfSimpleObject) ob).isTrue(); - } else { - b = false; - } - p = new Property(PROP_NAME_FIT_WINDOW, PropertyType.BOOLEAN, - Boolean.valueOf(b)); - propList.add(p); - - ob = prefDict.get(DICT_KEY_CENTER_WINDOW); - if (ob instanceof PdfSimpleObject) { - b = ((PdfSimpleObject) ob).isTrue(); - } else { - b = false; - } - p = new Property(PROP_NAME_CENTER_WINDOW, PropertyType.BOOLEAN, - Boolean.valueOf(b)); - propList.add(p); - - ob = prefDict.get(DICT_KEY_DISP_DOC_TITLE); - if (ob instanceof PdfSimpleObject) { - b = ((PdfSimpleObject) ob).isTrue(); - } else { - b = false; - } - p = new Property(PROP_NAME_DISP_DOC_TITLE, PropertyType.BOOLEAN, - Boolean.valueOf(b)); - propList.add(p); - - ob = prefDict.get(DICT_KEY_NO_FULL_PAGE); - if (ob instanceof PdfSimpleObject) { - s = ((PdfSimpleObject) ob).getStringValue(); - } else - s = DEFAULT_MODE; - p = new Property(PROP_NAME_NO_FULL_PAGE, PropertyType.STRING, s); - propList.add(p); - - ob = prefDict.get(DICT_KEY_DIRECTION); - if (ob instanceof PdfSimpleObject) { - s = ((PdfSimpleObject) ob).getStringValue(); - } else - s = "L2R"; - p = new Property(PROP_NAME_DIRECTION, PropertyType.STRING, s); - propList.add(p); - - ob = prefDict.get(DICT_KEY_VIEW_AREA); - if (ob instanceof PdfSimpleObject) { - s = ((PdfSimpleObject) ob).getStringValue(); - } else - s = PROP_VAL_CROP_BOX; - p = new Property(PROP_NAME_VIEW_AREA, PropertyType.STRING, s); - propList.add(p); - - ob = prefDict.get(DICT_KEY_VIEW_CLIP); - if (ob instanceof PdfSimpleObject) { - s = ((PdfSimpleObject) ob).getStringValue(); - } else - s = PROP_VAL_CROP_BOX; - p = new Property(PROP_NAME_VIEW_CLIP, PropertyType.STRING, s); - 
propList.add(p); - - ob = prefDict.get(DICT_KEY_PRINT_AREA); - if (ob instanceof PdfSimpleObject) { - s = ((PdfSimpleObject) ob).getStringValue(); - } else - s = PROP_VAL_CROP_BOX; - p = new Property(PROP_NAME_PRINT_AREA, PropertyType.STRING, s); - propList.add(p); - - ob = prefDict.get(DICT_KEY_PAGE_CLIP); - if (ob instanceof PdfSimpleObject) { - s = ((PdfSimpleObject) ob).getStringValue(); - } else - s = PROP_VAL_CROP_BOX; - p = new Property(PROP_NAME_PAGE_CLIP, PropertyType.STRING, s); - propList.add(p); - return prop; - } - - /* - * Return TRUE if the string is a font subset string, which begins - * with six uppercase letters and then a plus sign - */ - protected boolean isFontSubset(String baseStr) { - if (baseStr == null || baseStr.length() < 7) { - return false; - } - for (int i = 0; i < 6; i++) { - char ch = baseStr.charAt(i); - if (!Character.isUpperCase(ch)) { - return false; - } - } - return (baseStr.charAt(6) == '+'); - } - - /* - * Create the "Outlines" property from the Outlines item in the - * catalog dictionary. As a side effect, we set the actionsExist - * flag if any Actions are found. Because we check destinations, - * this can't be called till the page tree is built. - * - * Outlines can be recursive, according to Adobe people, so we have - * to track visited nodes. - */ - protected Property buildOutlinesProperty(PdfDictionary dict, RepInfo info) - throws PdfException { - _recursionWarned = false; - _visitedOutlineNodes = new HashSet(); - List itemList = new LinkedList(); - Property prop = new Property(PROP_NAME_OUTLINES, PropertyType.PROPERTY, - PropertyArity.LIST, itemList); - try { - PdfObject item = resolveIndirectObject(dict.get(DICT_KEY_FIRST)); - // In PDF 1.4, "First" and "Last" are unconditionally required. - // However, - // in 1.6, they can be omitted if there are no open or closed - // outline items. - // Strictly speaking, we should do several additional checks, but - // letting the - // outline go as empty seems sufficient. 
- // if (item == null || !(item instanceof PdfDictionary)) { - // throw new PdfInvalidException("Outline dictionary missing - // required entry"); - // } - int listCount = 0; // Guard against looping - while (item != null) { - Integer onum = new Integer(item.getObjNumber()); - Property p = buildOutlineItemProperty((PdfDictionary) item, - info); - itemList.add(p); - item = resolveIndirectObject( - ((PdfDictionary) item).get(DICT_KEY_NEXT)); - if (item == null) { - break; - } - // Check if this object is its own sibling. (It really does - // happen!) - if (item.getObjNumber() == onum.intValue()) { - if (!_recursionWarned) { - info.setMessage( - new InfoMessage(MessageConstants.PDF_HUL_123)); // PDF-HUL-123 - _recursionWarned = true; - } - break; - } - if (++listCount > 2000) { - break; - } - } - } catch (PdfException e1) { - throw e1; - } catch (Exception e) { - throw new PdfMalformedException(MessageConstants.PDF_HUL_124); // PDF-HUL-124 - } - if (itemList.isEmpty()) { - return null; - } - return prop; - } - - /* - * Create an item property within the outlines hierarchy. If an - * Outline item property has children, then there is a list - * property called "Children" with elements called "Item". - * It calls itself recursively to walk down the outline. - */ - protected Property buildOutlineItemProperty(PdfDictionary dict, - RepInfo info) throws PdfException { - List itemList = new ArrayList(3); - try { - Property prop = new Property(PROP_NAME_ITEM, PropertyType.PROPERTY, - PropertyArity.LIST, itemList); - PdfSimpleObject title = (PdfSimpleObject) resolveIndirectObject( - dict.get(DICT_KEY_TITLE)); - if (title == null) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_125); // PDF-HUL-125 - } - itemList.add(new Property(PROP_NAME_TITLE, PropertyType.STRING, - _encrypted ? 
ENCRYPTED : title.getStringValue())); - - // Check other required stuff - if (dict.get(DICT_KEY_PARENT) == null) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_126); // PDF-HUL-126 - } - PdfObject cnt = dict.get(DICT_KEY_COUNT); - if (cnt != null && (!(cnt instanceof PdfSimpleObject) - || !(((PdfSimpleObject) cnt) - .getToken() instanceof Numeric))) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_127); // PDF-HUL-127 - } - // The entries for Prev, Next, First, and Last must - // all be indirect references or absent. Just cast them to - // throw an exception if they're something else - @SuppressWarnings("unused") - PdfIndirectObj ob = (PdfIndirectObj) dict.get(DICT_KEY_PREV); - ob = (PdfIndirectObj) dict.get(DICT_KEY_NEXT); - ob = (PdfIndirectObj) dict.get(DICT_KEY_FIRST); - ob = (PdfIndirectObj) dict.get(DICT_KEY_LAST); - - // Check if there are Actions in the outline. This saves going - // through the outlines all over again if a Profile checker - // needs to know this. We flag only the existence of one or more - // Actions - // in the document. 
- if (dict.get("A") != null) { - _actionsExist = true; - } - - PdfObject destObj = dict.get(DICT_KEY_DEST); - if (destObj != null) { - destObj = resolveIndirectObject(destObj); - Destination dest = new Destination(destObj, this, false); - if (dest.isIndirect()) { - itemList.add(new Property(PROP_NAME_DESTINATION, - PropertyType.STRING, dest.getIndirectDest().getStringValue())); - } else { - int pageObjNum = dest.getPageDestObjNumber(); - Integer destPg = _pageSeqMap.get(new Integer(pageObjNum)); - if (destPg != null) { - itemList.add(new Property(PROP_NAME_DESTINATION, - PropertyType.INTEGER, destPg)); - } - } - } - - PdfDictionary child = (PdfDictionary) resolveIndirectObject( - dict.get(DICT_KEY_FIRST)); - if (child != null) { - List childList = new LinkedList(); - Property childProp = new Property(PROP_NAME_CHILDREN, - PropertyType.PROPERTY, PropertyArity.LIST, childList); - // We aren't catching all possible combinations of looping. Put - // a maximum - // on the list just to be safe. - int listCount = 0; - while (child != null) { - Integer onum = new Integer(child.getObjNumber()); - if (_visitedOutlineNodes.contains(onum)) { - /* We have recursion! */ - if (!_recursionWarned) { - // Warn of recursion - info.setMessage(new InfoMessage( - MessageConstants.PDF_HUL_128)); // PDF-HUL-128 - _recursionWarned = true; - } - } else { - _visitedOutlineNodes.add(onum); - Property p = buildOutlineItemProperty(child, info); - childList.add(p); - } - child = (PdfDictionary) resolveIndirectObject( - child.get(DICT_KEY_NEXT)); - if (child == null) { - break; - } - // Check if this object is its own sibling. (It really does - // happen!) 
- if (child.getObjNumber() == onum.intValue()) { - if (!_recursionWarned) { - info.setMessage(new InfoMessage( - MessageConstants.PDF_HUL_129)); // PDF-HUL-129 - _recursionWarned = true; - } - break; - } - if (++listCount > 2000) - break; // safety check - } - itemList.add(childProp); - } - return prop; - } catch (PdfException pe) { - throw pe; - } catch (ClassCastException ce) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_130); // PDF-HUL-130 - } catch (Exception e) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_131); // PDF-HUL-131 - } - } - - /* - * This is separated out from readDocCatalogDict, where it - * would otherwise make sense, because we can't build - * the outlines property till we have a page tree to - * locate destinations. - */ - protected boolean doOutlineStuff(RepInfo info) { - if (_outlineDict != null) { - try { - Property oprop = buildOutlinesProperty(_outlineDict, info); - if (_showOutlines || _verbosity == Module.MAXIMUM_VERBOSITY) { - if (oprop != null) { - _docCatalogList.add(oprop); - } - } else if (!_skippedOutlinesReported) { - // We report that we aren't reporting skipped outlines - info.setMessage( - new InfoMessage(MessageConstants.PDF_HUL_132)); // PDF-HUL-132 - _skippedOutlinesReported = true; - } - } catch (PdfException e) { - info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); - e.disparage(info); - // If it's just invalid, we can keep going - return (e instanceof PdfInvalidException); - } - } - return true; - } - - /* - * Given a PdfSimpleObject representing a key, - * look up the Destination which it references. - * There are two completely different ways this can be done, - * though any given PDF file is supposed to implement only one. - * If _destsDict is non-null, we look the string up there, and - * may find either a dictionary or an array. Otherwise - * if _destNames is non-null, it's a NameTreeNode which contains - * the mapping. 
In either case, the destination could be - * external, in which case we just return a string saying so. - * (The implementation of Destinations in PDF is a prime example - * of design by stone soup.) - * We return the page sequence number for the referenced page. - * If we can't find a match for the reference, we return -1. - */ - protected int resolveIndirectDest(PdfSimpleObject key, RepInfo info) throws PdfException { - if (key == null) { - throw new IllegalArgumentException("Argument key can not be null"); - } - _logger.finest("Looking for indirectly referenced Dest: " - + key.getStringValue()); - if (_destNames == null) - return -1; - PdfObject destObj = _destNames.get(key.getRawBytes()); - // Was the Dest this annotation refers to found in the document? - if (destObj == null) { - // Treat this condition as invalid: - String mess = MessageFormat.format( - MessageConstants.PDF_HUL_149.getMessage(), - key.getStringValue()); - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.PDF_HUL_149.getId(), mess); - info.setMessage(new ErrorMessage(message)); - throw new PdfInvalidException(message); // PDF-HUL-149 - // OR if this is not considered invalid - // return -1; - } - Destination dest = new Destination(destObj, this, true); - return dest.getPageDestObjNumber(); - } - - /* Build the user permission property., */ - protected Property buildUserPermProperty(int flags, String[] flagStrs) { - return buildBitmaskProperty(flags, "UserAccess", flagStrs, - "No permissions"); - } - - /** - * Add a string property, based on a dictionary entry - * with a string value, to a specified List. 
- */ - protected void addStringProperty(PdfDictionary dict, - List propList, String key, String propName) { - String propText = null; - PdfObject propObject = dict.get(key); - if (propObject instanceof PdfSimpleObject) { - Token tok = ((PdfSimpleObject) propObject).getToken(); - if (tok instanceof Literal) { - if (_encrypted) { - propText = ENCRYPTED; - } else { - propText = ((Literal) tok).getValue(); - } - propList.add( + if (ueObj != null) { + if (ueObj instanceof PdfSimpleObject) { + stdList.add(new Property(PROP_NAME_USERKEY_STRING, + PropertyType.STRING, + toHex(((PdfSimpleObject) ueObj).getRawBytes()))); + } + } else { + // if algValue is 5; UE is mandatory + throw new PdfInvalidException(MessageConstants.PDF_HUL_153, _parser.getOffset()); + } + } + _encryptList.add(new Property( + PROP_NAME_STANDARD_SECURITY_HANDLER, + PropertyType.PROPERTY, PropertyArity.LIST, stdList)); + } + PdfObject streamEncrypted = _encryptDict.get(DICT_KEY_STMF); + if (streamEncrypted instanceof PdfSimpleObject) { + _streamsEncrypted = true; + } + + } catch (PdfException e) { + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + return (e instanceof PdfInvalidException); + } + return true; + } + + protected boolean readDocInfoDict(RepInfo info) { + // Get the Info reference which we had before, and + // resolve it to the dictionary object. 
+ if (_docInfoDictRef == null) { + return true; // Info is optional + } + _docInfoList = new ArrayList(9); + try { + _docInfoDict = (PdfDictionary) resolveIndirectObject( + _docInfoDictRef); + addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_TITLE, + PROP_NAME_TITLE); + addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_AUTHOR, + PROP_NAME_AUTHOR); + addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_SUBJECT, + PROP_NAME_SUBJECT); + addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_KEYWORDS, + PROP_NAME_KEYWORDS); + addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_CREATOR, + PROP_NAME_CREATOR); + addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_PRODUCER, + PROP_NAME_PRODUCER); + + // CreationDate requires string-to-date conversion + // ModDate does too + addDateProperty(_docInfoDict, _docInfoList, DICT_KEY_CREATION_DATE, + PROP_NAME_CREATION_DATE); + addDateProperty(_docInfoDict, _docInfoList, DICT_KEY_MODIFIED_DATE, + PROP_NAME_MODIFIED_DATE); + addStringProperty(_docInfoDict, _docInfoList, DICT_KEY_TRAPPED, + PROP_NAME_TRAPPED); + } catch (PdfException e) { + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + // Keep parsing if it's only invalid + return (e instanceof PdfInvalidException); + } catch (Exception e) { + info.setWellFormed(false); + String mess = MessageFormat.format( + MessageConstants.PDF_HUL_94.getMessage(), + e.getClass().getName()); + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_94.getId(), mess); + info.setMessage(new ErrorMessage(message)); // PDF-HUL-94 + } + return true; + } + + protected boolean readDocumentTree(RepInfo info) { + try { + if (_pagesDictRef == null) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_95); // PDF-HUL-95 + } + + PdfObject pagesObj = resolveIndirectObject(_pagesDictRef); + if (pagesObj != null && !(pagesObj instanceof PdfDictionary)) { + throw new 
PdfMalformedException(MessageConstants.PDF_HUL_97); // PDF-HUL-97 + } else if (pagesObj != null) { + + PdfDictionary pagesDict = (PdfDictionary) pagesObj; + + // Check that the pages dict has a key type and the types value is + // Pages + if (!checkTypeKey(pagesDict, info, KEY_VAL_PAGES, + MessageConstants.PDF_HUL_146, // PDF-HUL-146 + MessageConstants.PDF_HUL_144, // PDF-HUL-144 + MessageConstants.PDF_HUL_145)) { // PDF-HUL-145 + return false; + } + + _docTreeRoot = new PageTreeNode(this, null, pagesDict); + _docTreeRoot.buildSubtree(true, MAX_PAGE_TREE_DEPTH); + } + } catch (PdfException e) { + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + // Continue parsing if it's only invalid + return (e instanceof PdfInvalidException); + } catch (ArrayIndexOutOfBoundsException excep) { + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_96, + _parser.getOffset())); // PDF-HUL-96 + info.setWellFormed(false); + return false; + } catch (Exception e) { + // Catch any odd exceptions + String mess = MessageFormat.format( + MessageConstants.PDF_HUL_98.getMessage(), + e.getClass().getName()); + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_98.getId(), mess); + info.setMessage(new ErrorMessage(message, _parser.getOffset())); // PDF-HUL-98 + info.setWellFormed(false); + return false; + } + return true; + } + + protected boolean readPageLabelTree(RepInfo info) { + // the page labels number tree is optional. 
+ try { + if (_pageLabelDict != null) { + _pageLabelRoot = new PageLabelNode(this, null, _pageLabelDict); + _pageLabelRoot.buildSubtree(); + } + } catch (PdfException e) { + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + // Continue parsing if it's only invalid + return (e instanceof PdfInvalidException); + } catch (Exception e) { + info.setWellFormed(false); + String mess = MessageFormat.format( + MessageConstants.PDF_HUL_99.getMessage(), + e.getClass().getName()); + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_99.getId(), mess); + info.setMessage(new ErrorMessage(message)); // PDF-HUL-99 + return false; + } + return true; // always succeeds + } + + protected boolean readXMPData(RepInfo info) { + try { + PdfStream metadata = (PdfStream) resolveIndirectObject( + _docCatDict.get(DICT_KEY_METADATA)); + if (metadata == null) { + return true; // Not required + } + // PdfDictionary metaDict = metadata.getDict (); + + // Create an InputSource to feed the parser. + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + XMLReader parser = factory.newSAXParser().getXMLReader(); + PdfXMPSource src = new PdfXMPSource(metadata, getFile()); + XMPHandler handler = new XMPHandler(); + parser.setContentHandler(handler); + parser.setErrorHandler(handler); + + // We have to parse twice. The first time, we may get + // an encoding change as part of an exception thrown. If this + // happens, we create a new InputSource with the encoding, and + // continue. 
+ try { + parser.parse(src); + _xmpProp = src.makeProperty(); + } catch (SAXException se) { + String msg = se.getMessage(); + if (msg != null && msg.startsWith(ENCODING_PREFIX)) { + String encoding = msg.substring(5); + try { + src = new PdfXMPSource(metadata, getFile(), encoding); + parser.parse(src); + _xmpProp = src.makeProperty(); + } catch (UnsupportedEncodingException uee) { + _logger.log(Level.INFO, + "Attempt to use explicit encoding to parse XMP metadata failed.", + uee); + throw new PdfInvalidException( + MessageConstants.PDF_HUL_100); // PDF-HUL-100 + } + } + } + + } catch (PdfException e) { + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + // Continue parsing if it's only invalid + return (e instanceof PdfInvalidException); + } catch (Exception e) { + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_101, // PDF-HUL-101 + _parser.getOffset())); + info.setValid(false); + return false; + } + return true; + } + + protected void findExternalStreams(RepInfo info) throws IOException { + _extStreamsList = new LinkedList(); + // stop processing if there is no root for the document tree + if (_docTreeRoot == null) + return; + _docTreeRoot.startWalk(); + try { + for (;;) { + // Get all the page objects in the document sequentially + PageObject page = _docTreeRoot.nextPageObject(); + if (page == null) { + break; + } + // Get the streams for the page and walk through them + List streams = page.getContentStreams(); + if (streams != null) { + ListIterator streamIter = streams.listIterator(); + while (streamIter.hasNext()) { + PdfStream stream = streamIter.next(); + String specStr = stream.getFileSpecification(); + if (specStr != null) { + Property prop = new Property(PROP_NAME_FILE, + PropertyType.STRING, specStr); + _extStreamsList.add(prop); + } + } + } + } + } catch (PdfException e) { + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage())); + } catch (Exception e) { + 
info.setWellFormed(false); + String mess = MessageFormat.format( + MessageConstants.PDF_HUL_102.getMessage(), + e.getClass().getName()); + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_102.getId(), mess); + info.setMessage(new ErrorMessage(message)); // PDF-HUL-102 + } + } + + /** + * Locates the filters in the content stream dictionaries + * and generate a list of unique pipelines. + * + * @return false if the filter structure is + * defective. + */ + protected boolean findFilters(RepInfo info) throws IOException { + _filtersList = new LinkedList(); + // stop processing if there is no root for the document tree + if (_docTreeRoot == null) + return false; + _docTreeRoot.startWalk(); + try { + for (;;) { + // Get all the page objects in the document sequentially + PageObject page = _docTreeRoot.nextPageObject(); + if (page == null) { + break; + } + // Get the streams for the page and walk through them + List streams = page.getContentStreams(); + if (streams != null) { + ListIterator streamIter = streams.listIterator(); + while (streamIter.hasNext()) { + PdfStream stream = streamIter.next(); + Filter[] filters = stream.getFilters(); + extractFilters(filters, stream); + } + } + } + } catch (PdfException e) { + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + // Continue parsing if it's only invalid + return (e instanceof PdfInvalidException); + } + return true; + } + + /** + * Finds the filters in a stream or array object which is the value + * of a stream's Filter key, and put them in _filtersList + * if a duplicate isn't there already. If the name is + * "Crypt", appends a colon and the name if available. + * Returns the filter string whether it's added or not, + * or null if there are no filters. + */ + protected String extractFilters(Filter[] filters, PdfStream stream) { + /* + * Concatenate the names into a string of names separated + * by spaces. 
+ */ + int len = filters.length; + if (len == 0) { + return null; + } + StringBuffer buf = new StringBuffer(); + for (int i = 0; i < len; i++) { + Filter filt = filters[i]; + String fname = filt.getFilterName(); + buf.append(fname); + /* If it's a Crypt filter, add the crypt name. */ + if (FILTER_NAME_CRYPT.equals(fname)) { + String cname = filt.getNameParam(); + if (cname != null) { + buf.append(":" + cname); + } + } + if (i < len - 1) { + buf.append(' '); + } + } + String filterStr = buf.toString(); + boolean unique = true; + // Check for uniqueness. + Iterator iter = _filtersList.iterator(); + while (iter.hasNext()) { + Property p = iter.next(); + String s = (String) p.getValue(); + if (s.equals(filterStr)) { + unique = false; + break; + } + } + if (filterStr != null && unique) { + Property prop = new Property(PROP_NAME_FILTER_PIPELINE, + PropertyType.STRING, filterStr); + _filtersList.add(prop); + } + return filterStr; + } + + protected void findImages(RepInfo info) throws IOException { + _imagesList = new LinkedList(); + // needed if object streams are encrypted + if (_docTreeRoot == null) { + return; + } + _docTreeRoot.startWalk(); + try { + for (;;) { + // Get all the page objects in the document sequentially + PageObject page = _docTreeRoot.nextPageObject(); + if (page == null) { + break; + } + // Get the resources for the page and look for image XObjects + PdfDictionary rsrc = page.getResources(); + if (rsrc != null) { + PdfDictionary xo = (PdfDictionary) resolveIndirectObject( + rsrc.get(RESOURCE_NAME_XOBJECT)); + if (xo != null) { + Iterator iter = xo.iterator(); + while (iter.hasNext()) { + // Get an XObject and check if it's an image. 
+ _logger.info("Getting image"); + PdfDictionary xobdict = null; + PdfObject xob = resolveIndirectObject(iter.next()); + if (xob instanceof PdfStream) { + xobdict = ((PdfStream) xob).getDict(); + } + if (xobdict != null) { + PdfSimpleObject subtype = (PdfSimpleObject) xobdict + .get(DICT_KEY_XOBJ_SUBTYPE); + if (XOBJ_SUBTYPE_IMAGE + .equals(subtype.getStringValue())) { + // It's an image XObject. Report stuff. + _logger.info("Image XObject"); + List imgList = new ArrayList( + 10); + Property prop = new Property( + PROP_NAME_IMAGE, + PropertyType.PROPERTY, + PropertyArity.LIST, imgList); + NisoImageMetadata niso = new NisoImageMetadata(); + imgList.add(new Property( + PROP_NAME_NISO_IMAGE_MD, + PropertyType.NISOIMAGEMETADATA, + niso)); + PdfObject widthBase = xobdict + .get(DICT_KEY_WIDTH); + PdfSimpleObject widObj = (PdfSimpleObject) resolveIndirectObject( + widthBase); + PdfObject heightBase = xobdict + .get(DICT_KEY_HEIGHT); + PdfSimpleObject htObj = (PdfSimpleObject) resolveIndirectObject( + heightBase); + if (widObj != null || htObj != null) { + niso.setImageWidth(widObj.getIntValue()); + niso.setImageLength(htObj.getIntValue()); + } else { + info.setWellFormed(false); + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_159.getId(), + MessageConstants.PDF_HUL_159.getMessage()); + info.setMessage(new ErrorMessage(message)); // PDF-HUL-159 + } + // Check for filters to add to the filter + // list + Filter[] filters = ((PdfStream) xob) + .getFilters(); + // Try to derive the image MIME type from + // filter names + String mimeType = imageMimeFromFilters( + filters); + niso.setMimeType(mimeType); + String filt = extractFilters(filters, + (PdfStream) xob); + if (filt != null) { + // If the filter is one which the NISO + // schema + // knows about, put it in the NISO + // metadata, + // otherwise put it in a Filter + // property. 
+ int nisoFilt = nameToNiso(filt, + compressionStrings, + compressionValues); + if (nisoFilt >= 0) { + /* + * If it's 2, it's a CCITTFaxDecode + * filter. There may be an optional + * K entry that can change the + * value. + */ + PdfObject parms = xobdict.get( + DICT_KEY_DECODE_PARAMS); + if (parms != null) { + PdfSimpleObject kobj = null; + if (parms instanceof PdfDictionary) { + PdfDictionary pdict = (PdfDictionary) parms; + kobj = (PdfSimpleObject) resolveIndirectObject( + pdict.get(DICT_KEY_K)); + } + /* + * Note that the DecodeParms + * value may also be an array + * of dictionaries. We are not + * handling that contingency. + */ + if (kobj != null) { + int k = kobj.getIntValue(); + if (k < 0) { + nisoFilt = 4; + } else if (k > 0) { + nisoFilt = 3; + } + } + } + niso.setCompressionScheme(nisoFilt); + } else { + imgList.add(new Property( + PROP_NAME_FILTER, + PropertyType.STRING, filt)); + } + } else { + niso.setCompressionScheme(1); // no + // filter + } + + // Check for color space info + PdfObject colorSpc = xobdict + .get(DICT_KEY_COLOR_SPACE); + if (colorSpc != null) { + String colorName = null; + if (colorSpc instanceof PdfSimpleObject) { + colorName = ((PdfSimpleObject) colorSpc) + .getStringValue(); + } else if (colorSpc instanceof PdfArray) { + Vector vec = ((PdfArray) colorSpc) + .getContent(); + // Use the first element, which is + // the color space family + PdfSimpleObject fam = (PdfSimpleObject) vec + .elementAt(0); + colorName = fam.getStringValue(); + } + if (colorName != null) { + int nisoSpace = nameToNiso( + colorName, + colorSpaceStrings, + colorSpaceValues); + if (nisoSpace >= 0) { + niso.setColorSpace(nisoSpace); + } else { + imgList.add(new Property( + PROP_NAME_COLOR_SPACE, + PropertyType.STRING, + colorName)); + } + } + } + + PdfSimpleObject bpc = (PdfSimpleObject) xobdict + .get(DICT_KEY_BITS_PER_COMPONENT); + if (bpc != null) { + // imgList.add(new + // Property(DICT_KEY_BITS_PER_COMPONENT, + // PropertyType.INTEGER, + // new Integer 
(bpc.getIntValue()))); + niso.setBitsPerSample(new int[] { + bpc.getIntValue() }); + } + + PdfSimpleObject intent = (PdfSimpleObject) xobdict + .get(DICT_KEY_INTENT); + if (intent != null) { + imgList.add(new Property( + PROP_NAME_INTENT, + PropertyType.STRING, + intent.getStringValue())); + } + + PdfSimpleObject imgmsk = (PdfSimpleObject) xobdict + .get(DICT_KEY_IMAGE_MASK); + if (imgmsk != null) { + boolean b = imgmsk.isTrue(); + imgList.add(new Property( + PROP_NAME_IMAGE_MASK, + PropertyType.BOOLEAN, + Boolean.valueOf(b))); + } + + PdfArray dcd = (PdfArray) xobdict + .get(DICT_KEY_DECODE); + if (dcd != null) { + Vector dcdvec = dcd + .getContent(); + List dcdlst = new ArrayList( + dcdvec.size()); + Iterator diter = dcdvec + .iterator(); + while (diter.hasNext()) { + PdfSimpleObject d = (PdfSimpleObject) diter + .next(); + dcdlst.add(new Integer( + d.getIntValue())); + } + imgList.add(new Property( + PROP_NAME_DECODE, + PropertyType.INTEGER, + PropertyArity.LIST, dcdlst)); + } + + PdfSimpleObject intrp = (PdfSimpleObject) xobdict + .get(DICT_KEY_INTERPOLATE); + if (intrp != null) { + boolean b = intrp.isTrue(); + imgList.add(new Property( + PROP_NAME_INTERPOLATE, + PropertyType.BOOLEAN, + Boolean.valueOf(b))); + } + + PdfSimpleObject nam = (PdfSimpleObject) xobdict + .get(DICT_KEY_NAME); + if (nam != null) { + imgList.add(new Property(PROP_NAME_NAME, + PropertyType.STRING, + nam.getStringValue())); + } + + PdfSimpleObject id = (PdfSimpleObject) resolveIndirectObject( + xobdict.get(DICT_KEY_ID)); + if (id != null) { + String idstr = toHex( + id.getStringValue()); + imgList.add(new Property(PROP_NAME_ID, + PropertyType.STRING, idstr)); + } + + _imagesList.add(prop); + } + + } + } + } + } + } + } catch (PdfException e) { + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + } catch (Exception e) { + info.setWellFormed(false); + String mess = MessageFormat.format( + MessageConstants.PDF_HUL_103.getMessage(), + 
e.getClass().getName()); + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_103.getId(), mess); + info.setMessage(new ErrorMessage(message)); // PDF-HUL-103 + } + } + + /* + * Convert a Filter name to a NISO compression scheme value. + * If the name is unknown to NISO, return -1. + */ + protected int nameToNiso(String name, String[] nameArray, int[] valArray) { + for (int i = 0; i < nameArray.length; i++) { + if (nameArray[i].equals(name)) { + return valArray[i]; + } + } + return -1; // no match + } + + protected void findFonts(RepInfo info) throws IOException { + _type0FontsMap = new HashMap(); + _type1FontsMap = new HashMap(); + _trueTypeFontsMap = new HashMap(); + _mmFontsMap = new HashMap(); + _type3FontsMap = new HashMap(); + _cid0FontsMap = new HashMap(); + _cid2FontsMap = new HashMap(); + // needed if object streams are encrypted + if (_docTreeRoot == null) { + return; + } + try { + _docTreeRoot.startWalk(); + for (;;) { + // This time we need all the page objects and page tree + // nodes, because resources can be inherited from + // page tree nodes. + DocNode node = _docTreeRoot.nextDocNode(); + if (node == null) { + break; + } + // Get the fonts for the node + PdfDictionary fonts = null; + fonts = node.getFontResources(); + if (fonts != null) { + // In order to make sure we have a collection of + // unique fonts, we store them in a map keyed by + // object number. + Iterator fontIter = fonts.iterator(); + while (fontIter.hasNext()) { + PdfObject fontRef = fontIter.next(); + PdfObject font = resolveIndirectObject(fontRef); + if (font instanceof PdfDictionary) { + addFontToMap((PdfDictionary) font); + } else { + // Expected a dictionary + info.setWellFormed(false); + info.setMessage(new ErrorMessage( + MessageConstants.PDF_HUL_104, // PDF-HUL-104 + _parser.getOffset())); + return; + } + // If we've been directed appropriately, + // we accumulate the information, but don't + // report it. 
In that case, we post a message + // just once to that effect. + if (!_skippedFontsReported && !_showFonts + && _verbosity != Module.MAXIMUM_VERBOSITY) { + info.setMessage(new InfoMessage( + MessageConstants.PDF_HUL_105)); // PDF-HUL-105 + _skippedFontsReported = true; + } + } + } + } + } catch (PdfException e) { + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + return; + } catch (Exception e) { + // Unexpected exception. + _logger.log(Level.WARNING, + MessageConstants.PDF_HUL_106.getMessage(), e); + info.setWellFormed(false); + info.setMessage(new ErrorMessage(MessageConstants.PDF_HUL_106, // PDF-HUL-106 + e.toString(), _parser.getOffset())); + return; + } + } + + /** + * Add the font to the appropriate map, and return its subtype. + * If we've exceeded the maximum number of fonts, then ignore it. + */ + protected String addFontToMap(PdfDictionary font) { + if (++_nFonts > maxFonts) { + return null; + } + String subtypeStr = null; + try { + PdfSimpleObject subtype = (PdfSimpleObject) font + .get(DICT_KEY_FONT_SUBTYPE); + subtypeStr = subtype.getStringValue(); + if (FONT_TYPE0.equals(subtypeStr)) { + _type0FontsMap.put(new Integer(font.getObjNumber()), font); + // If the font is Type 0, we must go + // through its descendant fonts + PdfObject desc0 = font.get(DICT_KEY_DESCENDANT_FONTS); + PdfArray descendants = (PdfArray) resolveIndirectObject(desc0); + Vector subfonts = descendants.getContent(); + Iterator subfontIter = subfonts.iterator(); + while (subfontIter.hasNext()) { + PdfObject subfont = subfontIter.next(); + subfont = resolveIndirectObject(subfont); + addFontToMap((PdfDictionary) subfont); + } + } else if (FONT_TYPE1.equals(subtypeStr)) { + _type1FontsMap.put(new Integer(font.getObjNumber()), font); + } else if (FONT_MM_TYPE1.equals(subtypeStr)) { + _mmFontsMap.put(new Integer(font.getObjNumber()), font); + } else if (FONT_TYPE3.equals(subtypeStr)) { + _type3FontsMap.put(new Integer(font.getObjNumber()), 
font); + } else if (FONT_TRUE_TYPE.equals(subtypeStr)) { + _trueTypeFontsMap.put(new Integer(font.getObjNumber()), font); + } else if (FONT_CID_TYPE0.equals(subtypeStr)) { + _cid0FontsMap.put(new Integer(font.getObjNumber()), font); + } else if (FONT_CID_TYPE2.equals(subtypeStr)) { + _cid2FontsMap.put(new Integer(font.getObjNumber()), font); + } + return subtypeStr; + } catch (Exception e) { + return null; + } + } + + /****************************************************************** + * PRIVATE CLASS METHODS. + ******************************************************************/ + + protected static String toHex(String s) { + StringBuffer buffer = new StringBuffer("0x"); + + int len = s.length(); + for (int i = 0; i < len; i++) { + String h = Integer.toHexString(s.charAt(i)); + if (h.length() < 2) { + buffer.append("0"); + } + buffer.append(h); + } + + return buffer.toString(); + } + + protected static String toHex(Vector v) { + StringBuffer buffer = new StringBuffer("0x"); + + int len = v.size(); + for (int i = 0; i < len; i++) { + int hdigit = v.elementAt(i).intValue(); + String h = Integer.toHexString(hdigit); + if (h.length() < 2) { + buffer.append("0"); + } + buffer.append(h); + } + + return buffer.toString(); + } + + /** + * If the argument is an indirect object reference, + * returns the object it resolves to, otherwise returns + * the object itself. In particular, calling with null will + * return null. + */ + public PdfObject resolveIndirectObject(PdfObject obj) + throws PdfException, IOException { + if (obj instanceof PdfIndirectObj) { + int objIndex = ((PdfIndirectObj) obj).getObjNumber(); + /* + * Here we need to allow for the possibility that the + * object is compressed in an object stream. That means + * creating a new structure (call it _xref2) that contains + * the stream object number and offset whenever _xref[objIndex] + * is negative. 
_xref2 will have to contain the content + * stream object number (which will itself have to be + * resolved) and the offset into the object stream. + */ + return getObject(objIndex, MAX_OBJ_STREAM_DEPTH); + } + return obj; + } + + /** + * Returns an object of a given number. This may involve + * recursion into object streams, in which case it calls itself. + * + * @param objIndex + * The object number to look up + * @param recGuard + * The maximum permitted number of recursion levels; + * no particular value is required, but 30 or more + * should avoid false exceptions. + */ + protected PdfObject getObject(int objIndex, int recGuard) + throws PdfException, IOException { + /* Guard against infinite recursion */ + if (recGuard <= 0) { + throw new PdfMalformedException(MessageConstants.PDF_HUL_107); + } + long offset = _xref[objIndex]; + if (offset == 0) { + return null; // This is considered legitimate by the spec + } + if (offset < 0) { + return getObjectFromStream(objIndex, recGuard); + } + _parser.seek(offset); + PdfObject obj = _parser.readObjectDef(this); + // + // Experimental carl@openpreservation.org 2018-03-14 + // + // Previously all object numbers (ids) were overwritten even if they'd + // previously been assigned. + // + // This is caused by a little confusion where the object ID and the + // index of the _xref array are used interchangeably when they're not + // the same thing. There's an assumption when for the _xref array + // that the objects will have continuous numeric object numbers. This + // means that the object number and array position will always be the + // same. The setting of the object number meant that the wrong object + // could + // be returned with the id changed to match the id requested. + // + // My guess is that the assignment was put in to ensure that an + // object that escaped initialisation had an object number. 
If that's + // the case then the code below will still allow that to happen but + // will prevent assigned numbers from been overwritten by the xref array + // position. + if (obj.getObjNumber() == -1) { + obj.setObjNumber(objIndex); + } + return obj; + } + + /** + * Return the RandomAccessFile being read. + */ + public RandomAccessFile getFile() { + return _raf; + } + + /** + * Returns the catalog dictionary object. + */ + public PdfDictionary getCatalogDict() { + return _docCatDict; + } + + /** + * Returns the trailer dictionary object. + */ + public PdfDictionary getTrailerDict() { + return _trailerDict; + } + + /** + * Returns the viewer preferences dictionary object. + */ + public PdfDictionary getViewPrefDict() { + return _viewPrefDict; + } + + /** + * Returns the outlines dictionary object. + */ + public PdfDictionary getOutlineDict() { + return _outlineDict; + } + + /** + * Get a font map. The map returned is determined by the selector. + * Any other value returns null. + */ + public Map getFontMap(int selector) { + switch (selector) { + case F_TYPE0: + return _type0FontsMap; + case F_TYPE1: + return _type1FontsMap; + case F_TT: + return _mmFontsMap; + case F_TYPE3: + return _type3FontsMap; + case F_MM1: + return _mmFontsMap; + case F_CID0: + return _cid0FontsMap; + case F_CID2: + return _cid2FontsMap; + default: + return null; + } + } + + /** + * Return a List of all the font maps. Together, these contain + * all the fonts and subfonts in the document. Some of the maps + * may be null. + */ + public List> getFontMaps() { + List> lst = new ArrayList>( + 7); + lst.add(_type0FontsMap); + lst.add(_type1FontsMap); + lst.add(_mmFontsMap); + lst.add(_type3FontsMap); + lst.add(_trueTypeFontsMap); + lst.add(_cid0FontsMap); + lst.add(_cid2FontsMap); + return lst; + } + + /** + * Returns a NameTreeNode for the EmbeddedFiles entry of the + * Names dictionary. Returns null if there isn't one. 
+ */ + public NameTreeNode getEmbeddedFiles() { + return _embeddedFiles; + } + + /** + * Add the various font lists as a fonts property. Note: only add + * the "Fonts" property if there are, in fact, fonts defined. + */ + protected void addFontsProperty(List metadataList) { + List fontTypesList = new LinkedList(); + Property fontp = null; + if (_type0FontsMap != null && !_type0FontsMap.isEmpty()) { + try { + fontp = buildFontProperty(PROP_NAME_FONT_TYPE0, _type0FontsMap, + F_TYPE0); + fontTypesList.add(fontp); + } catch (ClassCastException e) { + // Report an error here? + } + } + if (_type1FontsMap != null && !_type1FontsMap.isEmpty()) { + try { + fontp = buildFontProperty(PROP_NAME_FONT_TYPE1, _type1FontsMap, + F_TYPE1); + fontTypesList.add(fontp); + } catch (ClassCastException e) { + // Report an error here? + } + } + if (_trueTypeFontsMap != null && !_trueTypeFontsMap.isEmpty()) { + try { + fontp = buildFontProperty(PROP_NAME_FONT_TRUE_TYPE, + _trueTypeFontsMap, F_TT); + fontTypesList.add(fontp); + } catch (ClassCastException e) { + // Report an error here? 
+ } + } + if (_type3FontsMap != null && !_type3FontsMap.isEmpty()) { + try { + fontp = buildFontProperty(PROP_NAME_FONT_TYPE3, _type3FontsMap, + F_TYPE3); + fontTypesList.add(fontp); + } catch (ClassCastException e) { + } + } + if (_mmFontsMap != null && !_mmFontsMap.isEmpty()) { + try { + fontp = buildFontProperty(PROP_NAME_FONT_MM_TYPE1, _mmFontsMap, + F_MM1); + fontTypesList.add(fontp); + } catch (ClassCastException e) { + } + } + if (_cid0FontsMap != null && !_cid0FontsMap.isEmpty()) { + try { + fontp = buildFontProperty(PROP_NAME_FONT_CID_TYPE0, + _cid0FontsMap, F_CID0); + fontTypesList.add(fontp); + } catch (ClassCastException e) { + } + } + if (_cid2FontsMap != null && !_cid2FontsMap.isEmpty()) { + try { + fontp = buildFontProperty(PROP_NAME_FONT_CID_TYPE2, + _cid2FontsMap, F_CID2); + fontTypesList.add(fontp); + } catch (ClassCastException e) { + } + } + if (fontTypesList.size() > 0) { + metadataList.add(new Property(PROP_NAME_FONTS, + PropertyType.PROPERTY, PropertyArity.LIST, fontTypesList)); + } + } + + /* Build Pages property, with associated subproperties. */ + protected void addPagesProperty(List metadataList, RepInfo info) { + _pagesList = new LinkedList(); + _pageSeqMap = new HashMap(500); + // needed if object streams are encrypted + if (_docTreeRoot == null) { + return; + } + try { + _docTreeRoot.startWalk(); + int pageIndex = 0; + // Start the pipe with two entries. + // We always need to have the current and the next + // entry from the page label tree in order to determine + // the lower and upper bounds of the applicable range. + // If the first entry has a bound greater than zero, + // that appears to be an undefined situation, so we + // always treat the first entry as starting at zero. 
+ if (_pageLabelRoot != null) { + if (!_pageLabelRoot.findNextKeyValue()) { + throw new PdfMalformedException( + MessageConstants.PDF_HUL_111); // PDF-HUL-111 + } + + _pageLabelRoot.findNextKeyValue(); + } + for (;;) { + // Get all the page objects in the document sequentially + // Have to do this in two passes so that link + // destinations can be properly reported. + PageObject page = _docTreeRoot.nextPageObject(); + if (page == null) { + break; + } + _pageSeqMap.put(new Integer(page.getDict().getObjNumber()), + new Integer(pageIndex + 1)); + } + _docTreeRoot.startWalk(); + for (;;) { + PageObject page = _docTreeRoot.nextPageObject(); + if (page == null) { + break; + } + Property p = buildPageProperty(page, pageIndex++, info); + _pagesList.add(p); + } + if (_showPages || _verbosity == Module.MAXIMUM_VERBOSITY) { + Property prop = new Property(PROP_NAME_PAGES, + PropertyType.PROPERTY, PropertyArity.LIST, _pagesList); + metadataList.add(prop); + } else { + if (!_skippedPagesReported) { + info.setMessage( + new InfoMessage(MessageConstants.PDF_HUL_112)); // PDF-HUL-112 + _skippedPagesReported = true; + } + } + } catch (PdfException e) { + + e.disparage(info); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + return; + } + } + + /* Build a subproperty for one PageObject. */ + protected Property buildPageProperty(PageObject page, int idx, RepInfo info) + throws PdfException { + List pagePropList = new ArrayList(4); + try { + // Foo on Java's inability to return values through + // parameters. Passing an array is a crock to achieve + // that effect. + int[] nominalNum = new int[1]; + Property plProp = buildPageLabelProperty(page, idx, nominalNum); + if (plProp != null) { + pagePropList.add(plProp); + } + if (plProp == null || nominalNum[0] != idx + 1) { + // Page sequence is different from label, or + // there is no label. Make it 1-based. 
+ pagePropList.add(new Property(PROP_NAME_SEQUENCE, + PropertyType.INTEGER, new Integer(idx + 1))); + + } + } catch (PdfException e) { + throw e; + } catch (Exception f) { + throw new PdfMalformedException(MessageConstants.PDF_HUL_113); // PDF-HUL-113 + } + + try { + List annotsList = new LinkedList(); + PdfArray annots = page.getAnnotations(); + if (annots != null) { + Vector contents = annots.getContent(); + for (int i = 0; i < contents.size(); i++) { + PdfObject annot = resolveIndirectObject( + contents.elementAt(i)); + if (annot instanceof PdfDictionary) { + annotsList.add(buildAnnotProperty((PdfDictionary) annot, + info)); + } else if (annot instanceof PdfSimpleObject + && ((PdfSimpleObject) annot).getToken() instanceof Comment) { + // ignore Comments + continue; + + } else { + // There are annotations which aren't dictionaries. I've + // run into this, + // but it violates the spec as far as I can tell. + throw new PdfInvalidException( + MessageConstants.PDF_HUL_114); // PDF-HUL-114 + } + } + if (!annotsList.isEmpty()) { + if (_showAnnotations + || _verbosity == Module.MAXIMUM_VERBOSITY) { + Property annotProp = new Property(PROP_NAME_ANNOTATIONS, + PropertyType.PROPERTY, PropertyArity.LIST, + annotsList); + pagePropList.add(annotProp); + } else { + // We don't report annotations if we got here, + // but we do report that we don't report them. 
+ if (!_skippedAnnotationsReported) { + info.setMessage(new InfoMessage( + MessageConstants.PDF_HUL_115)); // PDF-HUL-115 + _skippedAnnotationsReported = true; + } + } + } + } + } catch (PdfException e) { + throw e; + } catch (Exception f) { + throw new PdfMalformedException(MessageConstants.PDF_HUL_116); // PDF-HUL-116 + } + + try { + // Rotation property is inheritable + PdfObject tempObj = page.get(DICT_KEY_ROTATE, + true); + PdfSimpleObject rot = null; + if (tempObj != null && tempObj instanceof PdfSimpleObject) { + rot = (PdfSimpleObject) tempObj; + } else if (tempObj != null && tempObj instanceof PdfIndirectObj) { + rot = (PdfSimpleObject) ((PdfIndirectObj) tempObj) + .getObject(); + } + if (rot != null && rot.getIntValue() != 0) { + pagePropList.add(new Property(PROP_NAME_ROTATE, + PropertyType.INTEGER, new Integer(rot.getIntValue()))); + } + + // UserUnit property (1.6), not inheritable + PdfSimpleObject uu = (PdfSimpleObject) page.get(DICT_KEY_USER_UNIT, + false); + if (uu != null) { + pagePropList.add(new Property(PROP_NAME_USER_UNIT, + PropertyType.DOUBLE, new Double(rot.getDoubleValue()))); + } + // Viewport dictionaries (1.6), not inheritable + PdfArray vp = (PdfArray) page.get(DICT_KEY_VIEWPORT, false); + if (vp != null) { + Vector vpv = vp.getContent(); + Iterator iter = vpv.iterator(); + List vplist = new ArrayList(vpv.size()); + while (iter.hasNext()) { + PdfDictionary vpd = (PdfDictionary) resolveIndirectObject( + iter.next()); + PdfObject vpdbb = vpd.get(DICT_KEY_BBOX); + List vpPropList = new ArrayList(); + vpPropList.add(makeRectProperty( + (PdfArray) resolveIndirectObject(vpdbb), + DICT_KEY_BBOX)); + PdfObject meas = vpd.get(DICT_KEY_MEASURE); + if (meas instanceof PdfDictionary) { + vpPropList.add( + buildMeasureProperty((PdfDictionary) meas)); + // No, that's wrong -- the Viewport property itself + // needs to be a list with a bounding box. 
+ } + vplist.add(new Property(PROP_NAME_VIEWPORT, + PropertyType.PROPERTY, PropertyArity.LIST, + vpPropList)); + } + pagePropList.add(new Property(PROP_NAME_VIEWPORTS, + PropertyType.PROPERTY, PropertyArity.LIST, vplist)); + } + // Thumbnail -- we just report if it's there. It's a + // non-inheritable property + PdfObject thumb = page.get(DICT_KEY_THUMB, false); + if (thumb != null) { + pagePropList.add(new Property(PROP_NAME_THUMB, + PropertyType.BOOLEAN, Boolean.TRUE)); + } + return new Property(PROP_NAME_PAGE, PropertyType.PROPERTY, + PropertyArity.LIST, pagePropList); + } catch (PdfException e) { + throw e; + } catch (Exception f) { + throw new PdfMalformedException(MessageConstants.PDF_HUL_117); // PDF-HUL-117 + } + } + + /* + * Build a subproperty of a subproperty for page labels. + * The nomNumRef argument is a crock for returning the + * nominal number; element 0 of the array is replaced + * by the nominal number of the page. + */ + protected Property buildPageLabelProperty(PageObject page, int pageIndex, + int[] nomNumRef) throws PdfException { + if (_pageLabelRoot == null) { + return null; // no page label info + } + + // Note that our "current" page is the page label tree's + // "previous" key. Sorry about that... + int curFirstPage = _pageLabelRoot.getPrevKey(); + int nextFirstPage = _pageLabelRoot.getCurrentKey(); + try { + // If we're onto the next page range, advance our pointers. 
+ if (pageIndex >= nextFirstPage) { + _pageLabelRoot.findNextKeyValue(); + curFirstPage = nextFirstPage; + } + PdfDictionary pageLabelDict = (PdfDictionary) resolveIndirectObject( + _pageLabelRoot.getPrevValue()); + StringBuffer labelText = new StringBuffer(); + PdfSimpleObject prefixObj = (PdfSimpleObject) pageLabelDict + .get(DICT_KEY_P); + if (prefixObj != null) { + labelText.append(prefixObj.getStringValue()); + } + PdfSimpleObject firstPageObj = (PdfSimpleObject) pageLabelDict + .get("St"); + // Sequence start value defaults to 1 if there's no start value + int firstPageVal = ((firstPageObj != null) + ? firstPageObj.getIntValue() + : 1); + int nominalPage = pageIndex - curFirstPage + firstPageVal; + if (nominalPage <= 0) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_118); // pDF-HUL-118 + } + nomNumRef[0] = nominalPage; + + // Get the numbering style. If there is no numbering + // style entry, the label consists only of the prefix. + PdfSimpleObject numStyleObj = (PdfSimpleObject) pageLabelDict + .get("S"); + String numStyle; + if (numStyleObj == null) { + numStyle = null; + } else { + numStyle = numStyleObj.getStringValue(); + } + if ("D".equals(numStyle)) { + // Nice, simple decimal numbers + labelText.append(nominalPage); + } else if ("R".equals(numStyle)) { + // Upper case roman numerals + labelText.append(PageLabelNode.intToRoman(nominalPage, true)); + } else if ("r".equals(numStyle)) { + // Lower case roman numerals + labelText.append(PageLabelNode.intToRoman(nominalPage, false)); + } else if ("A".equals(numStyle)) { + // Uppercase letters (A-Z, AA-ZZ, ...) + labelText.append(PageLabelNode.intToBase26(nominalPage, true)); + } else if ("a".equals(numStyle)) { + // Lowercase letters (a-z, aa-zz, ...) + labelText.append(PageLabelNode.intToBase26(nominalPage, false)); + } + // It screws up the PDF output if we have a blank Label property. 
+ if (labelText.length() == 0) { + labelText.append(EMPTY_LABEL_PROPERTY); + } + return new Property(PROP_NAME_LABEL, PropertyType.STRING, + labelText.toString()); + } catch (Exception e) { + throw new PdfMalformedException(MessageConstants.PDF_HUL_119); // PDF-HUL-119 + } + } + + /* Build a subproperty for a measure dictionary. */ + protected Property buildMeasureProperty(PdfDictionary meas) { + List plist = new ArrayList(); + PdfObject itemObj = meas.get(DICT_KEY_XOBJ_SUBTYPE); + if (itemObj instanceof PdfSimpleObject) { + plist.add(new Property(PROP_NAME_SUBTYPE, PropertyType.STRING, + ((PdfSimpleObject) itemObj).getStringValue())); + } + itemObj = meas.get(DICT_KEY_R); + if (itemObj instanceof PdfSimpleObject) { + plist.add(new Property(PROP_NAME_RATIO, PropertyType.STRING, + ((PdfSimpleObject) itemObj).getStringValue())); + } + // All kinds of stuff I could add -- limit it to the required + // X, Y, D and A arrays. + itemObj = meas.get("X"); + if (itemObj instanceof PdfArray) { + plist.add(buildNumberFormatArrayProperty((PdfArray) itemObj, "X")); + } + itemObj = meas.get("Y"); + if (itemObj instanceof PdfArray) { + plist.add(buildNumberFormatArrayProperty((PdfArray) itemObj, "Y")); + } + itemObj = meas.get("D"); + if (itemObj instanceof PdfArray) { + plist.add(buildNumberFormatArrayProperty((PdfArray) itemObj, PROP_NAME_DISTANCE)); + } + itemObj = meas.get("A"); + if (itemObj instanceof PdfArray) { + plist.add(buildNumberFormatArrayProperty((PdfArray) itemObj, PROP_NAME_AREA)); + } + return new Property(PROP_NAME_MEASURE, PropertyType.PROPERTY, + PropertyArity.LIST, plist); + } + + /* Build a subproperty for a number format array. 
*/ + private Property buildNumberFormatArrayProperty(PdfArray arr, String propertyName) { + Vector v = arr.getContent(); + List alist = new ArrayList<>(); + for (int i = 0; i < v.size(); i++) { + PdfObject xobj = v.elementAt(i); + if (xobj instanceof PdfDictionary) { + PdfObject obj = ((PdfDictionary) xobj).get("U"); + if (obj instanceof PdfSimpleObject) { + alist.add(new Property("Name", PropertyType.DOUBLE, ((PdfSimpleObject) obj).getDoubleValue())); + } + obj = ((PdfDictionary) xobj).get("C"); + if (obj instanceof PdfSimpleObject) { + alist.add( + new Property("Coefficient", PropertyType.STRING, ((PdfSimpleObject) obj).getStringValue())); + } + } + } + return new Property(propertyName, PropertyType.PROPERTY, PropertyArity.LIST, alist); + } + + /* Build a subproperty of a subproperty for an annotation. */ + protected Property buildAnnotProperty(PdfDictionary annot, RepInfo info) + throws PdfException { + List propList = new ArrayList(7); + PdfObject itemObj; + try { + // Subtype is required + itemObj = annot.get(DICT_KEY_XOBJ_SUBTYPE); + propList.add(new Property(PROP_NAME_SUBTYPE, PropertyType.STRING, + ((PdfSimpleObject) itemObj).getStringValue())); + + // Contents is optional for some subtypes, required for + // others. We consider it optional here. + itemObj = annot.get(DICT_KEY_CONTENTS); + if (itemObj != null) { + propList.add( + new Property(PROP_NAME_CONTENTS, PropertyType.STRING, + _encrypted ? ENCRYPTED + : ((PdfSimpleObject) itemObj) + .getStringValue())); + } + + // Rectangle is required, and must be in the rectangle format + itemObj = annot.get(DICT_KEY_RECT); + propList.add(makeRectProperty( + (PdfArray) resolveIndirectObject(itemObj), PROP_NAME_RECT)); + + // Name comes from the NM entry and is optional + itemObj = annot.get("NM"); + if (itemObj != null) { + propList.add(new Property(DICT_KEY_NAME, PropertyType.STRING, + _encrypted ? ENCRYPTED + : ((PdfSimpleObject) itemObj).getStringValue())); + } + + // LastModified is optional. 
The documentation says that + // a PDF date is preferred but not guaranteed. We just + // put it out as a string. + itemObj = annot.get("M"); + if (itemObj != null) { + Literal lastModLit = (Literal) ((PdfSimpleObject) itemObj) + .getToken(); + Property dateProp; + dateProp = new Property(PROP_NAME_LAST_MOD, PropertyType.STRING, + _encrypted ? ENCRYPTED + : lastModLit.getValue()); + + propList.add(dateProp); + } + + // Flags. + itemObj = annot.get("F"); + if (itemObj != null) { + int flagValue = ((PdfSimpleObject) itemObj).getIntValue(); + Property flagProp = (buildBitmaskProperty(flagValue, + PROP_NAME_FLAGS, PdfStrings.ANNOTATIONFLAGS, + PROP_VAL_NO_FLAGS_SET)); + if (flagProp != null) { + propList.add(flagProp); + } + } + + // Appearance dictionary -- just check if it's there. + itemObj = annot.get("AP"); + if (itemObj != null) { + propList.add(new Property(PROP_NAME_APP_DICT, + PropertyType.BOOLEAN, Boolean.TRUE)); + } + + // Action dictionary -- if it's there, set actionsExist + itemObj = annot.get("A"); + if (itemObj != null) { + _actionsExist = true; + itemObj = resolveIndirectObject(itemObj); + // Actions are as common as Destinations for + // connecting to destination pages. If the Action + // is of type GoTo, note its destination. + PdfSimpleObject actionSubtype = (PdfSimpleObject) ((PdfDictionary) itemObj) + .get("S"); + if (actionSubtype == null) { + throw new PdfMalformedException( + MessageConstants.PDF_HUL_120); // PDF-HUL-120 + } + if (ACTION_VAL_GOTO.equals(actionSubtype.getStringValue())) { + PdfObject destObj = ((PdfDictionary) itemObj).get("D"); + if (destObj != null) { + addDestination(destObj, PROP_NAME_ACTION_DEST, propList, + info); + } + } + } + + // Destination object. 
+ itemObj = annot.get(DICT_KEY_DEST); + if (itemObj != null) { + addDestination(itemObj, PROP_NAME_DESTINATION, propList, info); + } + + // Reply Type (RT) (1.6) + itemObj = annot.get("RT"); + if (itemObj instanceof PdfSimpleObject) { + String type = ((PdfSimpleObject) itemObj).getStringValue(); + propList.add(new Property(PROP_NAME_REPLY_TYPE, + PropertyType.STRING, type)); + } + + // Intent (IT) (1.6) + itemObj = annot.get("IT"); + if (itemObj instanceof PdfSimpleObject) { + String type = ((PdfSimpleObject) itemObj).getStringValue(); + propList.add(new Property(PROP_NAME_INTENT, PropertyType.STRING, + type)); + } + + // Callout Line (CL) (1.6) + itemObj = annot.get("CL"); + if (itemObj instanceof PdfArray) { + Vector clData = ((PdfArray) itemObj).getContent(); + // This should be an array of numbers. + Iterator iter = clData.iterator(); + List clList = new ArrayList(6); + while (iter.hasNext()) { + PdfSimpleObject clItem = (PdfSimpleObject) iter.next(); + clList.add(new Double(clItem.getDoubleValue())); + } + propList.add(new Property(PROP_NAME_CALLOUT_LINE, + PropertyType.DOUBLE, PropertyArity.LIST, clList)); + } + + return new Property(PROP_NAME_ANNOTATION, PropertyType.PROPERTY, + PropertyArity.LIST, propList); + } catch (PdfException ee) { + // Just rethrow these + throw ee; + } catch (Exception e) { + throw new PdfMalformedException(MessageConstants.PDF_HUL_121); // PDF-HUL-121 + } + } + + /* + * Given a PdfObject that stands for a Destination, add + * a representative property to the property list. + */ + protected void addDestination(PdfObject itemObj, String propName, + List propList, RepInfo info) { + try { + Destination dest = new Destination(itemObj, this, false); + if (dest.isIndirect()) { + // Encryption messes up name trees + if (!_encrypted) { + int pageObjNum = resolveIndirectDest( + dest.getIndirectDest(), info); + if (pageObjNum == -1) { + // The scope of the reference is outside this + // file, so we just report it as such. 
+ propList.add(new Property(propName, PropertyType.STRING, + PROP_VAL_EXTERNAL)); + } else { + propList.add(new Property(propName, + PropertyType.INTEGER, new Integer(pageObjNum))); + } + } + } else { + if (dest.getPageDest() == null) { + return; // can't get the page object number + } + int pageObjNum = dest.getPageDestObjNumber(); + Integer destPg = _pageSeqMap.get(new Integer(pageObjNum)); + if (destPg != null) { + propList.add(new Property(propName, PropertyType.INTEGER, + destPg)); + } + } + } catch (PdfMalformedException e) { + propList.add(new Property(propName, PropertyType.STRING, PROP_VAL_NULL)); + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + info.setValid(false); + } catch (PdfInvalidException e) { + if (e.getJhoveMessage() != null) { + info.setMessage(new ErrorMessage( + JhoveMessages.getMessageInstance( + e.getJhoveMessage().getId(), e.getJhoveMessage().getMessage(), + e.getJhoveMessage().getSubMessage()))); + } + } catch (Exception e) { + + String msg = e.getClass().getName(); + String msg1 = e.getMessage(); + if (msg1 != null) { + msg = msg + ": " + msg1; + } + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_122.getId(), msg); + propList.add( + new Property(propName, PropertyType.STRING, PROP_VAL_NULL)); + info.setMessage(new ErrorMessage(message, // PDF-HUL-122 + _parser.getOffset())); + info.setValid(false); + } + } + + /* + * Build up a property for one of the kinds of fonts + * in the file. + */ + protected Property buildFontProperty(String name, Map map, int fontType) { + List fontList = new LinkedList(); // list of fonts + Iterator fontIter = map.values().iterator(); + while (fontIter.hasNext()) { + // For each font in the map, build a property for it, + // which consists of a list of scalar properties. Each kind + // of font is spec'ed to have a slightly different set of + // properties, grumble... 
+ PdfDictionary dict = (PdfDictionary) fontIter.next(); + List fontPropList = oneFontPropList(dict, fontType); + Property fProp = new Property(PROP_NAME_FONT, PropertyType.PROPERTY, + PropertyArity.LIST, fontPropList); + fontList.add(fProp); + } + return new Property(name, PropertyType.PROPERTY, PropertyArity.LIST, + fontList); + } + + /* Build the Property list for a given font */ + protected List oneFontPropList(PdfDictionary dict, int fontType) { + List fontPropList = new LinkedList(); + Property prop; + if (fontType == F_TYPE1 || fontType == F_TYPE3 || fontType == F_MM1 + || fontType == F_TT) { + PdfObject tempObj = dict.get(DICT_KEY_NAME); + PdfSimpleObject nameObj = null; + if (tempObj instanceof PdfSimpleObject) { + nameObj = (PdfSimpleObject) tempObj; + } else if (tempObj instanceof PdfIndirectObj) { + nameObj = (PdfSimpleObject) ((PdfIndirectObj) tempObj) + .getObject(); + } + + if (nameObj != null) { + String nameStr = nameObj.getStringValue(); + prop = new Property(DICT_KEY_NAME, PropertyType.STRING, + nameStr); + fontPropList.add(prop); + } + } + + String baseStr = null; + if (fontType != F_TYPE3) { + PdfObject tempObj = dict.get(DICT_KEY_BASE_FONT); + PdfSimpleObject baseFontObj = null; + if (tempObj instanceof PdfSimpleObject) { + baseFontObj = (PdfSimpleObject) tempObj; + } else if (tempObj instanceof PdfIndirectObj) { + baseFontObj = (PdfSimpleObject) ((PdfIndirectObj) tempObj) + .getObject(); + } + + if (baseFontObj != null) { + baseStr = baseFontObj.getStringValue(); + prop = new Property(PROP_NAME_BASE_FONT, PropertyType.STRING, + baseStr); + fontPropList.add(prop); + } + } + + if (fontType == F_CID0 || fontType == F_CID2) { + PdfObject elCid = dict.get(DICT_KEY_CID_INFO); + try { + elCid = resolveIndirectObject(elCid); + } catch (Exception e) { + } + if (elCid instanceof PdfDictionary) { + prop = buildCIDInfoProperty((PdfDictionary) elCid); + fontPropList.add(prop); + } + } + + if (fontType == F_TYPE1 || fontType == F_TT || fontType == F_MM1) { 
+ if (isFontSubset(baseStr)) { + prop = new Property(PROP_NAME_FONT_SUBSET, PropertyType.BOOLEAN, + Boolean.TRUE); + fontPropList.add(prop); + } + } + + if (fontType == F_TYPE1 || fontType == F_TT || fontType == F_MM1 + || fontType == F_TYPE3) { + PdfObject firstCharObj = dict.get(DICT_KEY_FIRST_CHAR); + if (firstCharObj instanceof PdfIndirectObj) { + firstCharObj = ((PdfIndirectObj) firstCharObj).getObject(); + } + try { + int firstChar = ((PdfSimpleObject) firstCharObj).getIntValue(); + prop = new Property(PROP_NAME_FIRST_CHAR, PropertyType.INTEGER, + new Integer(firstChar)); + fontPropList.add(prop); + } catch (Exception e) { + } + + PdfObject lastCharObj = dict.get(DICT_KEY_LAST_CHAR); + if (lastCharObj instanceof PdfIndirectObj) { + lastCharObj = ((PdfIndirectObj) lastCharObj).getObject(); + } + try { + int lastChar = ((PdfSimpleObject) lastCharObj).getIntValue(); + prop = new Property(PROP_NAME_LAST_CHAR, PropertyType.INTEGER, + new Integer(lastChar)); + fontPropList.add(prop); + } catch (Exception e) { + } + } + + if (fontType == F_TYPE3) { + // Put FontBBox and CharProcs into properties + PdfObject bboxObj = dict.get(DICT_KEY_FONT_BBOX); + try { + if (bboxObj instanceof PdfArray) { + fontPropList.add(makeRectProperty((PdfArray) bboxObj, + PROP_VAL_FONT_BBOX)); + } + } catch (Exception e) { + } + + // For CharProcs, we're just checking if it's there. + // (It's required for a Type 3 font.) 
+ // PdfObject charProcs = dict.get("CharProcs"); + // prop = new Property("CharProcs", + // PropertyType.BOOLEAN, + // Boolean.valueOf(charProcs != null)); + // fontPropList.add(prop); + } + + if (fontType == F_TYPE1 || fontType == F_TT || fontType == F_MM1 + || fontType == F_CID0 || fontType == F_CID2) { + PdfObject descriptorObj = dict.get(DICT_KEY_FONT_DESCRIPTOR); + try { + descriptorObj = resolveIndirectObject(descriptorObj); + } catch (Exception e) { + } + if (descriptorObj instanceof PdfDictionary) { + prop = buildFontDescriptorProperty( + (PdfDictionary) descriptorObj); + fontPropList.add(prop); + } + } + + PdfObject encodingObj = dict.get(DICT_KEY_ENCODING); + try { + encodingObj = resolveIndirectObject(encodingObj); + } catch (Exception e) { + } + + if (fontType == F_TYPE0 || fontType == F_TYPE1 || fontType == F_TT + || fontType == F_MM1 || fontType == F_TYPE3) { + // Encoding property -- but only if Encoding is a name + if (encodingObj instanceof PdfSimpleObject) { + prop = new Property(PROP_NAME_ENCODING, PropertyType.STRING, + ((PdfSimpleObject) encodingObj).getStringValue()); + fontPropList.add(prop); + } + } + + if (fontType == F_TYPE1 || fontType == F_TT || fontType == F_MM1 + || fontType == F_TYPE3) { + if (encodingObj != null && encodingObj instanceof PdfDictionary) { + prop = buildEncodingDictProperty((PdfDictionary) encodingObj); + fontPropList.add(prop); + } + } + + if (fontType == F_TYPE0) { + // Encoding is reported as a CMapDictionary property for type 0 + if (encodingObj != null && encodingObj instanceof PdfStream) { + prop = buildCMapDictProperty((PdfStream) encodingObj); + fontPropList.add(prop); + } + } + + if (fontType == F_TYPE3) { + // All we're interested in for Resources is whether + // the dictionary exists + PdfObject rsrc = dict.get(DICT_KEY_RESOURCES); + if (rsrc != null) { + prop = new Property(PROP_NAME_RESOURCES, PropertyType.BOOLEAN, + Boolean.TRUE); + fontPropList.add(prop); + } + } + + if (fontType == F_TYPE0 || fontType 
== F_TYPE1 || fontType == F_TT + || fontType == F_MM1 || fontType == F_TYPE3) { + PdfObject toUniObj = dict.get(DICT_KEY_TO_UNICODE); + if (toUniObj != null) { + prop = new Property(PROP_NAME_TO_UNICODE, PropertyType.BOOLEAN, + Boolean.TRUE); + fontPropList.add(prop); + } + } + + return fontPropList; + } + + /* + * Code for CMapProperty for Type 0 fonts, based on the Encoding + * entry, broken out of buildFontProperty. + */ + protected Property buildCMapDictProperty(PdfStream encoding) { + PdfDictionary dict = encoding.getDict(); + List propList = new ArrayList(4); + Property prop = new Property(PROP_NAME_CMAP_DICT, PropertyType.PROPERTY, + PropertyArity.LIST, propList); + Property subprop; + + // PdfObject mapName = dict.get ("CMapName"); + + PdfObject cidSysInfo = dict.get(DICT_KEY_CID_INFO); + // We can use buildCIDInfoProperty here to build the subproperty + PdfDictionary cidDict; + List cidList = new LinkedList(); + try { + if (cidSysInfo instanceof PdfDictionary) { + // One CIDInfo dictionary + cidDict = (PdfDictionary) cidSysInfo; + subprop = buildCIDInfoProperty(cidDict); + cidList.add(subprop); + } else if (cidSysInfo instanceof PdfArray) { + // Many CIDInfo dictionaries + Vector v = ((PdfArray) cidSysInfo).getContent(); + for (int i = 0; i < v.size(); i++) { + cidDict = (PdfDictionary) v.elementAt(i); + Property subsubprop = buildCIDInfoProperty(cidDict); + cidList.add(subsubprop); + } + } + } catch (Exception e) { + } + + if (!cidList.isEmpty()) { + subprop = new Property(PROP_NAME_CID_INFOS, PropertyType.PROPERTY, + PropertyArity.LIST, cidList); + propList.add(subprop); + } + + // PdfObject wMod = dict.get("WMode"); + // PdfObject useCMap = dict.get("UseCMap"); + + return prop; + } + + /* + * Code for CIDInfoProperty for CIDFontType0 and CIDFontType2 + * conts. 
+ */ + protected Property buildCIDInfoProperty(PdfDictionary dict) { + List propList = new ArrayList(3); + Property prop = new Property(PROP_NAME_CID_INFO, PropertyType.PROPERTY, + PropertyArity.LIST, propList); + Property subprop; + + // Add the registry identifier + PdfObject reg = dict.get(DICT_KEY_REGISTRY); + if (reg instanceof PdfSimpleObject) { + try { + String regText = ((PdfSimpleObject) reg).getStringValue(); + subprop = new Property(PROP_NAME_REGISTRY, PropertyType.STRING, + _encrypted ? ENCRYPTED : regText); + propList.add(subprop); + } catch (Exception e) { + } + } + + // Add the name of the char collection within the registry + PdfObject order = dict.get(DICT_KEY_ORDERING); + if (reg instanceof PdfSimpleObject) { + try { + String ordText = ((PdfSimpleObject) order).getStringValue(); + subprop = new Property(PROP_NAME_REGISTRY, PropertyType.STRING, + ordText); + propList.add(subprop); + } catch (Exception e) { + } + } + + PdfObject supp = dict.get(DICT_KEY_SUPPLEMENT); + if (supp instanceof PdfSimpleObject) { + try { + int suppvalue = ((PdfSimpleObject) supp).getIntValue(); + subprop = new Property(PROP_NAME_SUPPLEMENT, + PropertyType.INTEGER, new Integer(suppvalue)); + propList.add(subprop); + } catch (Exception e) { + } + } + return prop; + } + + /* + * Code for EncodingDictionary Property for type 1, 3, TrueType, and + * MM fonts. This is based on a dictionary entry with the same name + * as the one for buildCMapDictProperty, but different information. + * Included properties are BaseEncoding and Differences. 
+ */ + protected Property buildEncodingDictProperty(PdfDictionary encodingDict) { + List propList = new ArrayList(2); + Property prop = new Property(PROP_NAME_ENCODING_DICTIONARY, + PropertyType.PROPERTY, PropertyArity.LIST, propList); + PdfObject baseEnc = encodingDict.get(DICT_KEY_BASE_ENCODING); + if (baseEnc instanceof PdfSimpleObject) { + String baseEncString = ((PdfSimpleObject) baseEnc).getStringValue(); + if (baseEncString != null) { + Property baseEncProp = new Property(PROP_NAME_BASE_ENCODING, + PropertyType.STRING, baseEncString); + propList.add(baseEncProp); + } + } + + PdfObject diffs = encodingDict.get(DICT_KEY_DIFFERENCES); + Property diffsProp = new Property(PROP_NAME_DIFFERENCES, + PropertyType.BOOLEAN, Boolean.valueOf(diffs != null)); + propList.add(diffsProp); + + return prop; + } + + /* + * Separated-out code for FontDescriptor property. This + * is a list of six Properies: FontName, Flags, + * FontBBox, FontFile, FontFile2, and FontFile3. + */ + protected Property buildFontDescriptorProperty(PdfDictionary encodingDict) { + List propList = new ArrayList(6); + Property prop = new Property(PROP_NAME_FONT_DESC, PropertyType.PROPERTY, + PropertyArity.LIST, propList); + Property subprop; + try { + PdfSimpleObject fName = (PdfSimpleObject) encodingDict + .get(DICT_KEY_FONT_NAME); + String fNameStr = fName.getStringValue(); + subprop = new Property(PROP_NAME_FONT_NAME, PropertyType.STRING, + fNameStr); + propList.add(subprop); + } catch (Exception e) { + } + + try { + PdfSimpleObject flags = (PdfSimpleObject) encodingDict + .get(DICT_KEY_FLAGS); + int flagValue = flags.getIntValue(); + subprop = buildBitmaskProperty(flagValue, PROP_NAME_FLAGS, + PdfStrings.FONTDESCFLAGS, PROP_VAL_NO_FLAGS_SET); + if (subprop != null) { + propList.add(subprop); + } + } catch (Exception e) { + } + + try { + PdfArray bboxObj = (PdfArray) encodingDict.get(DICT_KEY_FONT_BBOX); + double[] bbox = bboxObj.toRectangle(); + // toRectangle is written to return an array of double, 
+ // which is what the bounding box is in the most general + // case; but the spec requires an array of integer, so + // we convert is. This may seem like an excess of work, + // but I'd rather have toRectangle do the right thing + // rather than losing generality. + if (bbox != null) { + int[] ibbox = new int[4]; + for (int i = 0; i < 4; i++) { + ibbox[i] = (int) bbox[i]; + } + subprop = new Property(PROP_NAME_FONT_BBOX, + PropertyType.INTEGER, PropertyArity.ARRAY, ibbox); + propList.add(subprop); + } + } catch (Exception e) { + } + + PdfObject fontFile = encodingDict.get(DICT_KEY_FONT_FILE); + if (fontFile != null) { + // All we care about is whether it exists or not + subprop = new Property(PROP_NAME_FONT_FILE, PropertyType.BOOLEAN, + Boolean.TRUE); + propList.add(subprop); + } + fontFile = encodingDict.get(DICT_KEY_FONT_FILE_2); + if (fontFile != null) { + subprop = new Property(PROP_NAME_FONT_FILE_2, PropertyType.BOOLEAN, + Boolean.TRUE); + propList.add(subprop); + } + fontFile = encodingDict.get(DICT_KEY_FONT_FILE_3); + if (fontFile != null) { + subprop = new Property(PROP_NAME_FONT_FILE_3, PropertyType.BOOLEAN, + Boolean.TRUE); + propList.add(subprop); + } + return prop; + } + + protected Property buildViewPrefProperty(PdfDictionary prefDict) { + Property p; + PdfObject ob; + boolean b; + String s; + List propList = new ArrayList(12); + Property prop = new Property(DICT_KEY_VIEWER_PREFS, + PropertyType.PROPERTY, PropertyArity.LIST, propList); + + ob = prefDict.get(DICT_KEY_HIDE_TOOLBAR); + if (ob instanceof PdfSimpleObject) { + b = ((PdfSimpleObject) ob).isTrue(); + } else { + b = false; + } + p = new Property(PROP_NAME_HIDE_TOOLBAR, PropertyType.BOOLEAN, + Boolean.valueOf(b)); + propList.add(p); + + ob = prefDict.get(DICT_KEY_HIDE_MENUBAR); + if (ob instanceof PdfSimpleObject) { + b = ((PdfSimpleObject) ob).isTrue(); + } else { + b = false; + } + p = new Property(PROP_NAME_HIDE_MENUBAR, PropertyType.BOOLEAN, + Boolean.valueOf(b)); + propList.add(p); + + ob 
= prefDict.get(DICT_KEY_HIDE_WINDOW_UI); + if (ob instanceof PdfSimpleObject) { + b = ((PdfSimpleObject) ob).isTrue(); + } else { + b = false; + } + p = new Property(PROP_NAME_HIDE_WINDOW_UI, PropertyType.BOOLEAN, + Boolean.valueOf(b)); + propList.add(p); + + ob = prefDict.get(DICT_KEY_FIT_WINDOW); + if (ob instanceof PdfSimpleObject) { + b = ((PdfSimpleObject) ob).isTrue(); + } else { + b = false; + } + p = new Property(PROP_NAME_FIT_WINDOW, PropertyType.BOOLEAN, + Boolean.valueOf(b)); + propList.add(p); + + ob = prefDict.get(DICT_KEY_CENTER_WINDOW); + if (ob instanceof PdfSimpleObject) { + b = ((PdfSimpleObject) ob).isTrue(); + } else { + b = false; + } + p = new Property(PROP_NAME_CENTER_WINDOW, PropertyType.BOOLEAN, + Boolean.valueOf(b)); + propList.add(p); + + ob = prefDict.get(DICT_KEY_DISP_DOC_TITLE); + if (ob instanceof PdfSimpleObject) { + b = ((PdfSimpleObject) ob).isTrue(); + } else { + b = false; + } + p = new Property(PROP_NAME_DISP_DOC_TITLE, PropertyType.BOOLEAN, + Boolean.valueOf(b)); + propList.add(p); + + ob = prefDict.get(DICT_KEY_NO_FULL_PAGE); + if (ob instanceof PdfSimpleObject) { + s = ((PdfSimpleObject) ob).getStringValue(); + } else + s = DEFAULT_MODE; + p = new Property(PROP_NAME_NO_FULL_PAGE, PropertyType.STRING, s); + propList.add(p); + + ob = prefDict.get(DICT_KEY_DIRECTION); + if (ob instanceof PdfSimpleObject) { + s = ((PdfSimpleObject) ob).getStringValue(); + } else + s = "L2R"; + p = new Property(PROP_NAME_DIRECTION, PropertyType.STRING, s); + propList.add(p); + + ob = prefDict.get(DICT_KEY_VIEW_AREA); + if (ob instanceof PdfSimpleObject) { + s = ((PdfSimpleObject) ob).getStringValue(); + } else + s = PROP_VAL_CROP_BOX; + p = new Property(PROP_NAME_VIEW_AREA, PropertyType.STRING, s); + propList.add(p); + + ob = prefDict.get(DICT_KEY_VIEW_CLIP); + if (ob instanceof PdfSimpleObject) { + s = ((PdfSimpleObject) ob).getStringValue(); + } else + s = PROP_VAL_CROP_BOX; + p = new Property(PROP_NAME_VIEW_CLIP, PropertyType.STRING, s); + 
propList.add(p); + + ob = prefDict.get(DICT_KEY_PRINT_AREA); + if (ob instanceof PdfSimpleObject) { + s = ((PdfSimpleObject) ob).getStringValue(); + } else + s = PROP_VAL_CROP_BOX; + p = new Property(PROP_NAME_PRINT_AREA, PropertyType.STRING, s); + propList.add(p); + + ob = prefDict.get(DICT_KEY_PAGE_CLIP); + if (ob instanceof PdfSimpleObject) { + s = ((PdfSimpleObject) ob).getStringValue(); + } else + s = PROP_VAL_CROP_BOX; + p = new Property(PROP_NAME_PAGE_CLIP, PropertyType.STRING, s); + propList.add(p); + return prop; + } + + /* + * Return TRUE if the string is a font subset string, which begins + * with six uppercase letters and then a plus sign + */ + protected boolean isFontSubset(String baseStr) { + if (baseStr == null || baseStr.length() < 7) { + return false; + } + for (int i = 0; i < 6; i++) { + char ch = baseStr.charAt(i); + if (!Character.isUpperCase(ch)) { + return false; + } + } + return (baseStr.charAt(6) == '+'); + } + + /* + * Create the "Outlines" property from the Outlines item in the + * catalog dictionary. As a side effect, we set the actionsExist + * flag if any Actions are found. Because we check destinations, + * this can't be called till the page tree is built. + * + * Outlines can be recursive, according to Adobe people, so we have + * to track visited nodes. + */ + protected Property buildOutlinesProperty(PdfDictionary dict, RepInfo info) + throws PdfException { + _recursionWarned = false; + _visitedOutlineNodes = new HashSet(); + List itemList = new LinkedList(); + Property prop = new Property(PROP_NAME_OUTLINES, PropertyType.PROPERTY, + PropertyArity.LIST, itemList); + try { + PdfObject item = resolveIndirectObject(dict.get(DICT_KEY_FIRST)); + // In PDF 1.4, "First" and "Last" are unconditionally required. + // However, + // in 1.6, they can be omitted if there are no open or closed + // outline items. + // Strictly speaking, we should do several additional checks, but + // letting the + // outline go as empty seems sufficient. 
+ // if (item == null || !(item instanceof PdfDictionary)) { + // throw new PdfInvalidException("Outline dictionary missing + // required entry"); + // } + int listCount = 0; // Guard against looping + while (item != null) { + Integer onum = new Integer(item.getObjNumber()); + Property p = buildOutlineItemProperty((PdfDictionary) item, + info); + itemList.add(p); + item = resolveIndirectObject( + ((PdfDictionary) item).get(DICT_KEY_NEXT)); + if (item == null) { + break; + } + // Check if this object is its own sibling. (It really does + // happen!) + if (item.getObjNumber() == onum.intValue()) { + if (!_recursionWarned) { + info.setMessage( + new InfoMessage(MessageConstants.PDF_HUL_123)); // PDF-HUL-123 + _recursionWarned = true; + } + break; + } + if (++listCount > 2000) { + break; + } + } + } catch (PdfException e1) { + throw e1; + } catch (Exception e) { + throw new PdfMalformedException(MessageConstants.PDF_HUL_124); // PDF-HUL-124 + } + if (itemList.isEmpty()) { + return null; + } + return prop; + } + + /* + * Create an item property within the outlines hierarchy. If an + * Outline item property has children, then there is a list + * property called "Children" with elements called "Item". + * It calls itself recursively to walk down the outline. + */ + protected Property buildOutlineItemProperty(PdfDictionary dict, + RepInfo info) throws PdfException { + List itemList = new ArrayList(3); + try { + Property prop = new Property(PROP_NAME_ITEM, PropertyType.PROPERTY, + PropertyArity.LIST, itemList); + PdfSimpleObject title = (PdfSimpleObject) resolveIndirectObject( + dict.get(DICT_KEY_TITLE)); + if (title == null) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_125); // PDF-HUL-125 + } + itemList.add(new Property(PROP_NAME_TITLE, PropertyType.STRING, + _encrypted ? 
ENCRYPTED : title.getStringValue())); + + // Check other required stuff + if (dict.get(DICT_KEY_PARENT) == null) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_126); // PDF-HUL-126 + } + PdfObject cnt = dict.get(DICT_KEY_COUNT); + if (cnt != null && (!(cnt instanceof PdfSimpleObject) + || !(((PdfSimpleObject) cnt) + .getToken() instanceof Numeric))) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_127); // PDF-HUL-127 + } + // The entries for Prev, Next, First, and Last must + // all be indirect references or absent. Just cast them to + // throw an exception if they're something else + @SuppressWarnings("unused") + PdfIndirectObj ob = (PdfIndirectObj) dict.get(DICT_KEY_PREV); + ob = (PdfIndirectObj) dict.get(DICT_KEY_NEXT); + ob = (PdfIndirectObj) dict.get(DICT_KEY_FIRST); + ob = (PdfIndirectObj) dict.get(DICT_KEY_LAST); + + // Check if there are Actions in the outline. This saves going + // through the outlines all over again if a Profile checker + // needs to know this. We flag only the existence of one or more + // Actions + // in the document. 
+ if (dict.get("A") != null) { + _actionsExist = true; + } + + PdfObject destObj = dict.get(DICT_KEY_DEST); + if (destObj != null) { + destObj = resolveIndirectObject(destObj); + Destination dest = new Destination(destObj, this, false); + if (dest.isIndirect()) { + itemList.add(new Property(PROP_NAME_DESTINATION, + PropertyType.STRING, dest.getIndirectDest().getStringValue())); + } else { + int pageObjNum = dest.getPageDestObjNumber(); + Integer destPg = _pageSeqMap.get(new Integer(pageObjNum)); + if (destPg != null) { + itemList.add(new Property(PROP_NAME_DESTINATION, + PropertyType.INTEGER, destPg)); + } + } + } + + PdfDictionary child = (PdfDictionary) resolveIndirectObject( + dict.get(DICT_KEY_FIRST)); + if (child != null) { + List childList = new LinkedList(); + Property childProp = new Property(PROP_NAME_CHILDREN, + PropertyType.PROPERTY, PropertyArity.LIST, childList); + // We aren't catching all possible combinations of looping. Put + // a maximum + // on the list just to be safe. + int listCount = 0; + while (child != null) { + Integer onum = new Integer(child.getObjNumber()); + if (_visitedOutlineNodes.contains(onum)) { + /* We have recursion! */ + if (!_recursionWarned) { + // Warn of recursion + info.setMessage(new InfoMessage( + MessageConstants.PDF_HUL_128)); // PDF-HUL-128 + _recursionWarned = true; + } + } else { + _visitedOutlineNodes.add(onum); + Property p = buildOutlineItemProperty(child, info); + childList.add(p); + } + child = (PdfDictionary) resolveIndirectObject( + child.get(DICT_KEY_NEXT)); + if (child == null) { + break; + } + // Check if this object is its own sibling. (It really does + // happen!) 
+ if (child.getObjNumber() == onum.intValue()) { + if (!_recursionWarned) { + info.setMessage(new InfoMessage( + MessageConstants.PDF_HUL_129)); // PDF-HUL-129 + _recursionWarned = true; + } + break; + } + if (++listCount > 2000) + break; // safety check + } + itemList.add(childProp); + } + return prop; + } catch (PdfException pe) { + throw pe; + } catch (ClassCastException ce) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_130); // PDF-HUL-130 + } catch (Exception e) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_131); // PDF-HUL-131 + } + } + + /* + * This is separated out from readDocCatalogDict, where it + * would otherwise make sense, because we can't build + * the outlines property till we have a page tree to + * locate destinations. + */ + protected boolean doOutlineStuff(RepInfo info) { + if (_outlineDict != null) { + try { + Property oprop = buildOutlinesProperty(_outlineDict, info); + if (_showOutlines || _verbosity == Module.MAXIMUM_VERBOSITY) { + if (oprop != null) { + _docCatalogList.add(oprop); + } + } else if (!_skippedOutlinesReported) { + // We report that we aren't reporting skipped outlines + info.setMessage( + new InfoMessage(MessageConstants.PDF_HUL_132)); // PDF-HUL-132 + _skippedOutlinesReported = true; + } + } catch (PdfException e) { + info.setMessage(new ErrorMessage(e.getJhoveMessage(), _parser.getOffset())); + e.disparage(info); + // If it's just invalid, we can keep going + return (e instanceof PdfInvalidException); + } + } + return true; + } + + /* + * Given a PdfSimpleObject representing a key, + * look up the Destination which it references. + * There are two completely different ways this can be done, + * though any given PDF file is supposed to implement only one. + * If _destsDict is non-null, we look the string up there, and + * may find either a dictionary or an array. Otherwise + * if _destNames is non-null, it's a NameTreeNode which contains + * the mapping. 
In either case, the destination could be + * external, in which case we just return a string saying so. + * (The implementation of Destinations in PDF is a prime example + * of design by stone soup.) + * We return the object number of the referenced page (from getPageDestObjNumber). + * If _destNames is null we return -1; a key with no match throws PdfInvalidException. + */ + protected int resolveIndirectDest(PdfSimpleObject key, RepInfo info) throws PdfException { + if (key == null) { + throw new IllegalArgumentException("Argument key can not be null"); + } + _logger.finest("Looking for indirectly referenced Dest: " + + key.getStringValue()); + if (_destNames == null) + return -1; + PdfObject destObj = _destNames.get(key.getRawBytes()); + // Was the Dest this annotation refers to found in the document? + if (destObj == null) { + // Treat this condition as invalid: + String mess = MessageFormat.format( + MessageConstants.PDF_HUL_149.getMessage(), + key.getStringValue()); + JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.PDF_HUL_149.getId(), mess); + info.setMessage(new ErrorMessage(message)); + throw new PdfInvalidException(message); // PDF-HUL-149 + // OR if this is not considered invalid + // return -1; + } + Destination dest = new Destination(destObj, this, true); + return dest.getPageDestObjNumber(); + } + + /* Build the user permission property. */ + protected Property buildUserPermProperty(int flags, String[] flagStrs) { + return buildBitmaskProperty(flags, "UserAccess", flagStrs, + "No permissions"); + } + + /** + * Add a string property, based on a dictionary entry + * with a string value, to a specified List. 
+ */ + protected void addStringProperty(PdfDictionary dict, + List propList, String key, String propName) { + String propText = null; + PdfObject propObject = dict.get(key); + if (propObject instanceof PdfSimpleObject) { + Token tok = ((PdfSimpleObject) propObject).getToken(); + if (tok instanceof Literal) { + if (_encrypted) { + propText = ENCRYPTED; + } else { + propText = ((Literal) tok).getValue(); + } + propList.add( new Property(propName, PropertyType.STRING, (propText == null) ? "" : propText)); - } - } - } - - /** - * Add a date property, based on a dictionary entry - * with a string value, to a specified List. - */ - protected void addDateProperty(PdfDictionary dict, List propList, - String key, String propName) throws PdfException { - if (_encrypted) { - String propText = ENCRYPTED; - propList.add(new Property(propName, PropertyType.STRING, propText)); - } else { - PdfObject propObject = dict.get(key); - if (propObject instanceof PdfSimpleObject) { - Token tok = ((PdfSimpleObject) propObject).getToken(); - if (tok instanceof Literal) { - Literal lit = (Literal) tok; - Date propDate = lit.parseDate(); - if (propDate != null) { - propList.add(new Property(propName, PropertyType.DATE, propDate)); - // Ignore empty literals as this isn't an error - } else if (!lit.getValue().isEmpty()) { - throw new PdfInvalidException(MessageConstants.PDF_HUL_133, 0); // PDF-HUL-133 - } - } - } - } - } - - /* - * General function for adding a property with a 32-bit - * value, with an array of Strings to interpret - * the value as a bitmask. 
- */ - protected Property buildBitmaskProperty(int val, String name, - String[] valueNames, String defaultStr) { - if (_je != null && _je.getShowRawFlag()) { - return new Property(name, PropertyType.INTEGER, new Integer(val)); - } - List slist = new LinkedList(); - try { - for (int i = 0; i < valueNames.length; i++) { - if ((val & (1 << i)) != 0 && valueNames[i].length() > 0) { - slist.add(valueNames[i]); - } - } - // Provision for a default string if the property - // would otherwise have an empty list - if (slist.isEmpty() && defaultStr != null) { - slist.add(defaultStr); - } - } catch (Exception e) { - return null; - } - return new Property(name, PropertyType.STRING, PropertyArity.LIST, - slist); - } - - /* - * Take a PdfArray which is supposed to conform to the rectangle - * description (i.e., it's an array of 4 numbers) and create - * a Property which is an array of 4 integers. - */ - protected Property makeRectProperty(PdfArray arrObj, String name) { - int[] iarr = new int[4]; - double[] arr = arrObj.toRectangle(); - // toRectangle is written to return an array of double, - // which is what the bounding box is in the most general - // case; but the spec requires an array of integer, so - // we convert it. This may seem like an excess of work, - // but I'd rather have toRectangle do the right thing - // rather than losing generality. 
- for (int i = 0; i < 4; i++) { - iarr[i] = (int) arr[i]; - } - return new Property(name, PropertyType.INTEGER, PropertyArity.ARRAY, - iarr); - } - - private static boolean checkTypeKey(final PdfDictionary dict, - final RepInfo info, final String expctVal, - final JhoveMessage typeInvalMess, - final JhoveMessage typeNotFoundMess, - final JhoveMessage typeNotSimpleMess) { - // Get the type key from the dictionary - PdfObject typeObj = dict.get(DICT_KEY_TYPE); - if (typeObj != null && typeObj instanceof PdfSimpleObject) { - // If the type key is not null and is a simple object - String typeValue = ((PdfSimpleObject) typeObj).getStringValue(); - if (!expctVal.equals(typeValue)) { - // If the type key value is not of the expected value - info.setWellFormed(false); - info.setMessage(new ErrorMessage(typeInvalMess, 0)); - return false; - } - } else { - // There's no type key or it's not a simple object - // Choose message depending on whether the value is null or of - // the wrong type - JhoveMessage message = (typeObj == null) ? 
typeNotFoundMess - : typeNotSimpleMess; - info.setMessage(new ErrorMessage(message, 0)); - info.setWellFormed(false); - return false; - } - return true; - } - - private static String imageMimeFromFilters(Filter[] filters) { - // If there's no filters it's a PNG - if (filters == null || filters.length == 0) { - return "image/png"; - } - // Iterate the filter list - for (Filter filt : filters) { - // Get the Filter name - String filterName = filt.getFilterName(); - // And the MIME type from htat - String mime = imageMimeFromFilterName(filterName); - if (mime != null) { - // If it's not null then return - return mime; - } - // Next filter - } - // No MIME type match made for filter list - return null; - } - - // Stolen from an Apache PDF Box method: - // https://github.com/apache/pdfbox/blob/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImageXObject.java#L767 - private static String imageMimeFromFilterName(final String filterName) { - if (FILTER_NAME_DCT.equals(filterName)) { - // DCTDecode is JPEG - return "image/jpg"; - } else if (FILTER_NAME_JPX.equals(filterName)) { - // JPX Decode for JPX (JP2K) - return "image/jpx"; - } else if (FILTER_NAME_CCITT.equals(filterName)) { - // CCITT is a TIFF image - return "image/tiff"; - } else if (FILTER_NAME_FLATE.equals(filterName) - || FILTER_NAME_LZW.equals(filterName) - || FILTER_NAME_RUN_LENGTH.equals(filterName)) { - // There's a bunch of PNG possibilities - return "image/png"; - } - // No match made - return null; - } - - private PdfObject getObjectFromStream(final int objIndex, - final int recGuard) throws PdfMalformedException { - /* - * The object is located in an object stream. Need to get the - * object stream first. - * Be cautious dealing with _cachedStreamIndex and _cachedObjectStream; - * these can be modified by a recursive call to getObject. 
- */ - try { - int objStreamIndex = _xref2[objIndex][0]; - PdfObject streamObj; - ObjectStream ostrm = null; - if (!_streamsEncrypted) { - if (objStreamIndex == _cachedStreamIndex) { - ostrm = _cachedObjectStream; - // Reset it - if (ostrm.isValid()) { - ostrm.readIndex(); - } - } else { - streamObj = resolveIndirectObject( - getObject(objStreamIndex, recGuard - 1)); - if (streamObj instanceof PdfStream) { - ostrm = new ObjectStream((PdfStream) streamObj, _raf); - if (ostrm.isValid()) { - ostrm.readIndex(); - _cachedObjectStream = ostrm; - _cachedStreamIndex = objStreamIndex; - } else { - throw new PdfMalformedException( - MessageConstants.PDF_HUL_108); // PDF-HUL-108 - } - } - } - /* And finally extract the object from the object stream. */ - return ostrm.getObject(objIndex); - }else { - return null; - } - - } catch (ZipException excep) { - _logger.info(excep.getMessage()); - throw new PdfMalformedException(MessageConstants.PDF_HUL_109); // PDF-HUL-109 - } catch (Exception e) { - _logger.info(e.getMessage()); - /* Fall through with error */ - } - throw new PdfMalformedException(MessageConstants.PDF_HUL_110); // PDF-HUL-110 - } + } + } + } + + /** + * Add a date property, based on a dictionary entry + * with a string value, to a specified List. 
+ */ + protected void addDateProperty(PdfDictionary dict, List propList, + String key, String propName) throws PdfException { + if (_encrypted) { + String propText = ENCRYPTED; + propList.add(new Property(propName, PropertyType.STRING, propText)); + } else { + PdfObject propObject = dict.get(key); + if (propObject instanceof PdfSimpleObject) { + Token tok = ((PdfSimpleObject) propObject).getToken(); + if (tok instanceof Literal) { + Literal lit = (Literal) tok; + Date propDate = lit.parseDate(); + if (propDate != null) { + propList.add(new Property(propName, PropertyType.DATE, propDate)); + // Ignore empty literals as this isn't an error + } else if (!lit.getValue().isEmpty()) { + throw new PdfInvalidException(MessageConstants.PDF_HUL_133, 0); // PDF-HUL-133 + } + } + } + } + } + + /* + * General function for adding a property with a 32-bit + * value, with an array of Strings to interpret + * the value as a bitmask. + */ + protected Property buildBitmaskProperty(int val, String name, + String[] valueNames, String defaultStr) { + if (_je != null && _je.getShowRawFlag()) { + return new Property(name, PropertyType.INTEGER, new Integer(val)); + } + List slist = new LinkedList(); + try { + for (int i = 0; i < valueNames.length; i++) { + if ((val & (1 << i)) != 0 && valueNames[i].length() > 0) { + slist.add(valueNames[i]); + } + } + // Provision for a default string if the property + // would otherwise have an empty list + if (slist.isEmpty() && defaultStr != null) { + slist.add(defaultStr); + } + } catch (Exception e) { + return null; + } + return new Property(name, PropertyType.STRING, PropertyArity.LIST, + slist); + } + + /* + * Take a PdfArray which is supposed to conform to the rectangle + * description (i.e., it's an array of 4 numbers) and create + * a Property which is an array of 4 integers. 
+ */ + protected Property makeRectProperty(PdfArray arrObj, String name) { + int[] iarr = new int[4]; + double[] arr = arrObj.toRectangle(); + // toRectangle is written to return an array of double, + // which is what the bounding box is in the most general + // case; but the spec requires an array of integer, so + // we convert it. This may seem like an excess of work, + // but I'd rather have toRectangle do the right thing + // rather than losing generality. + for (int i = 0; i < 4; i++) { + iarr[i] = (int) arr[i]; + } + return new Property(name, PropertyType.INTEGER, PropertyArity.ARRAY, + iarr); + } + + private static boolean checkTypeKey(final PdfDictionary dict, + final RepInfo info, final String expctVal, + final JhoveMessage typeInvalMess, + final JhoveMessage typeNotFoundMess, + final JhoveMessage typeNotSimpleMess) { + // Get the type key from the dictionary + PdfObject typeObj = dict.get(DICT_KEY_TYPE); + if (typeObj != null && typeObj instanceof PdfSimpleObject) { + // If the type key is not null and is a simple object + String typeValue = ((PdfSimpleObject) typeObj).getStringValue(); + if (!expctVal.equals(typeValue)) { + // If the type key value is not of the expected value + info.setWellFormed(false); + info.setMessage(new ErrorMessage(typeInvalMess, 0)); + return false; + } + } else { + // There's no type key or it's not a simple object + // Choose message depending on whether the value is null or of + // the wrong type + JhoveMessage message = (typeObj == null) ? 
typeNotFoundMess + : typeNotSimpleMess; + info.setMessage(new ErrorMessage(message, 0)); + info.setWellFormed(false); + return false; + } + return true; + } + + private static String imageMimeFromFilters(Filter[] filters) { + // If there are no filters it's a PNG + if (filters == null || filters.length == 0) { + return "image/png"; + } + // Iterate the filter list + for (Filter filt : filters) { + // Get the Filter name + String filterName = filt.getFilterName(); + // And the MIME type from that + String mime = imageMimeFromFilterName(filterName); + if (mime != null) { + // If it's not null then return + return mime; + } + // Next filter + } + // No MIME type match made for filter list + return null; + } + + // Stolen from an Apache PDF Box method: + // https://github.com/apache/pdfbox/blob/2.0/pdfbox/src/main/java/org/apache/pdfbox/pdmodel/graphics/image/PDImageXObject.java#L767 + private static String imageMimeFromFilterName(final String filterName) { + if (FILTER_NAME_DCT.equals(filterName)) { + // DCTDecode is JPEG + return "image/jpg"; + } else if (FILTER_NAME_JPX.equals(filterName)) { + // JPX Decode for JPX (JP2K) + return "image/jpx"; + } else if (FILTER_NAME_CCITT.equals(filterName)) { + // CCITT is a TIFF image + return "image/tiff"; + } else if (FILTER_NAME_FLATE.equals(filterName) + || FILTER_NAME_LZW.equals(filterName) + || FILTER_NAME_RUN_LENGTH.equals(filterName)) { + // There's a bunch of PNG possibilities + return "image/png"; + } + // No match made + return null; + } + + private PdfObject getObjectFromStream(final int objIndex, + final int recGuard) throws PdfMalformedException { + /* + * The object is located in an object stream. Need to get the + * object stream first. + * Be cautious dealing with _cachedStreamIndex and _cachedObjectStream; + * these can be modified by a recursive call to getObject. 
+ */ + try { + int objStreamIndex = _xref2[objIndex][0]; + PdfObject streamObj; + ObjectStream ostrm = null; + if (!_streamsEncrypted) { + if (objStreamIndex == _cachedStreamIndex) { + ostrm = _cachedObjectStream; + // Reset it + if (ostrm.isValid()) { + ostrm.readIndex(); + } + } else { + streamObj = resolveIndirectObject( + getObject(objStreamIndex, recGuard - 1)); + if (streamObj instanceof PdfStream) { + ostrm = new ObjectStream((PdfStream) streamObj, _raf); + if (ostrm.isValid()) { + ostrm.readIndex(); + _cachedObjectStream = ostrm; + _cachedStreamIndex = objStreamIndex; + } else { + throw new PdfMalformedException( + MessageConstants.PDF_HUL_108); // PDF-HUL-108 + } + } + } + /* And finally extract the object from the object stream. */ + return ostrm.getObject(objIndex); + } else { + return null; + } + + } catch (ZipException excep) { + _logger.info(excep.getMessage()); + throw new PdfMalformedException(MessageConstants.PDF_HUL_109); // PDF-HUL-109 + } catch (Exception e) { + _logger.info(e.getMessage()); + /* Fall through with error */ + } + throw new PdfMalformedException(MessageConstants.PDF_HUL_110); // PDF-HUL-110 + } } diff --git a/jhove-modules/pom.xml b/jhove-modules/pom.xml index b029bda9c..7f7a99480 100644 --- a/jhove-modules/pom.xml +++ b/jhove-modules/pom.xml @@ -5,7 +5,7 @@ org.openpreservation.jhove jhove - 1.29.0-SNAPSHOT + 1.30.0-RC1 org.openpreservation.jhove.modules @@ -18,7 +18,7 @@ org.openpreservation.jhove jhove-core - 1.29.0-SNAPSHOT + 1.30.0-RC1 org.junit.vintage diff --git a/jhove-modules/tiff-hul/pom.xml b/jhove-modules/tiff-hul/pom.xml index 02d34c814..7a335a740 100644 --- a/jhove-modules/tiff-hul/pom.xml +++ b/jhove-modules/tiff-hul/pom.xml @@ -3,7 +3,7 @@ org.openpreservation.jhove.modules jhove-modules - 1.29.0-SNAPSHOT + 1.30.0-RC1 tiff-hul 1.9.4 diff --git a/jhove-modules/utf8-hul/pom.xml b/jhove-modules/utf8-hul/pom.xml index dd540ade0..f75d052cd 100644 --- a/jhove-modules/utf8-hul/pom.xml +++ b/jhove-modules/utf8-hul/pom.xml 
@@ -3,7 +3,7 @@ org.openpreservation.jhove.modules jhove-modules - 1.29.0-SNAPSHOT + 1.30.0-RC1 utf8-hul 1.7.3 diff --git a/jhove-modules/wave-hul/pom.xml b/jhove-modules/wave-hul/pom.xml index 02eb1c05e..dd20e8626 100644 --- a/jhove-modules/wave-hul/pom.xml +++ b/jhove-modules/wave-hul/pom.xml @@ -3,10 +3,10 @@ org.openpreservation.jhove.modules jhove-modules - 1.29.0-SNAPSHOT + 1.30.0-RC1 wave-hul - 1.8.2 + 1.8.3 JHOVE WAVE Module HUL WAVE module developed by Harvard University Library diff --git a/jhove-modules/wave-hul/src/main/java/edu/harvard/hul/ois/jhove/module/WaveModule.java b/jhove-modules/wave-hul/src/main/java/edu/harvard/hul/ois/jhove/module/WaveModule.java index e58bb691c..10ffa2b9d 100644 --- a/jhove-modules/wave-hul/src/main/java/edu/harvard/hul/ois/jhove/module/WaveModule.java +++ b/jhove-modules/wave-hul/src/main/java/edu/harvard/hul/ois/jhove/module/WaveModule.java @@ -71,8 +71,8 @@ public class WaveModule extends ModuleBase { /* Module metadata */ private static final String NAME = "WAVE-hul"; - private static final String RELEASE = "1.8.2"; - private static final int [] DATE = { 2022, 04, 22 }; + private static final String RELEASE = "1.8.3"; + private static final int [] DATE = { 2024, 03, 05 }; private static final String[] FORMATS = { "WAVE", "Audio for Windows", "EBU Technical Specification 3285", "Broadcast Wave Format", "BWF", "EBU Technical Specification 3306", "RF64" }; diff --git a/jhove-modules/xml-hul/pom.xml b/jhove-modules/xml-hul/pom.xml index 6268e2f6a..7de1e49db 100644 --- a/jhove-modules/xml-hul/pom.xml +++ b/jhove-modules/xml-hul/pom.xml @@ -3,10 +3,10 @@ org.openpreservation.jhove.modules jhove-modules - 1.29.0-SNAPSHOT + 1.30.0-RC1 xml-hul - 1.5.3 + 1.5.4 JHOVE XML Module HUL XML module developed by Harvard University Library diff --git a/jhove-modules/xml-hul/src/main/java/edu/harvard/hul/ois/jhove/module/XmlModule.java b/jhove-modules/xml-hul/src/main/java/edu/harvard/hul/ois/jhove/module/XmlModule.java index 
6c067232a..2e6669e97 100644 --- a/jhove-modules/xml-hul/src/main/java/edu/harvard/hul/ois/jhove/module/XmlModule.java +++ b/jhove-modules/xml-hul/src/main/java/edu/harvard/hul/ois/jhove/module/XmlModule.java @@ -48,982 +48,982 @@ */ public class XmlModule extends ModuleBase { - private static final String NAME = "XML-hul"; - private static final String RELEASE = "1.5.3"; - private static final int[] DATE = { 2023, 03, 16 }; - private static final String[] FORMAT = { "XML", "XHTML" }; - private static final String COVERAGE = "XML 1.0"; - private static final String[] MIMETYPE = { "text/xml", "application/xml", - "text/html" }; - private static final String WELLFORMED = "An XML file is well-formed if " - + "it meets the criteria defined in Section 2.1 of the XML " - + "specification (W3C Recommendation, 3rd edition, 2004-02-04)"; - private static final String VALIDITY = "An XML file is valid if " - + "well-formed, and the file has an associated DTD or XML Schema and " - + "the file meets the constraints defined by that DTD or Schema"; - private static final String REPINFO = "Additional representation " - + "information includes: version, endcoding, standalone flag, DTD or " - + "schema, namespaces, notations, character references, entities, " - + "processing instructions, and comments"; - private static final String NOTE = "This module determines " - + "well-formedness and validity using the SAX2-conforming parser " - + "specified by the invoking application"; - private static final String RIGHTS = "Copyright 2004-2007 by JSTOR and " - + "the President and Fellows of Harvard College. " - + "Released under the GNU Lesser General Public License."; - - /** Top-level property list. */ - protected List _propList; - - /** Top-level property. */ - protected Property _metadata; - - /** Doctype for XHTML documents only, otherwise null. */ - protected String _xhtmlDoctype; - - /** Base URL for DTDs. If null, all DTD URLs are absolute. 
*/ - protected String _baseURL; - - /** - * Flag to control signature checking behavior. If true, - * checkSignatures insists on an XML document declaration; if - * false, it will parse the file if there is no document - * declaration. - */ - protected boolean _sigWantsDecl; - - /** - * Flag to indicate we're invoking the parser from checkSignatures. - * When true, it's up to checkSignatures to mark a signature as present. - */ - protected boolean _parseFromSig; - - /** Flag to indicate if TextMD metadata should be reported. */ - protected boolean _withTextMD; - - /** TextMD metadata for the file being processed. */ - protected TextMDMetadata _textMD; - - /** Map of URLs to locally stored schemas. */ - protected Map _localSchemas; - - /** - * Class constructor. - * - * Instantiate an XmlModule object. - */ - public XmlModule() { - super(NAME, RELEASE, DATE, FORMAT, COVERAGE, MIMETYPE, WELLFORMED, - VALIDITY, REPINFO, NOTE, RIGHTS, false); - - _vendor = Agent.harvardInstance(); - - Document doc = new Document( - "Extensible Markup Language (XML) 1.0 (Third Edition)", - DocumentType.REPORT); - doc.setPublisher(Agent.newW3CInstance()); - doc.setDate("2004-02-04"); - doc.setIdentifier(new Identifier("http://www.w3.org/TR/REC-xml", - IdentifierType.URL)); - _specification.add(doc); - - doc = new Document("SAX", DocumentType.WEB); - doc.setIdentifier(new Identifier("http://sax.sourceforge.net/", - IdentifierType.URL)); - _specification.add(doc); - - Signature sig = new ExternalSignature(".xml", SignatureType.EXTENSION, - SignatureUseType.OPTIONAL); - _signature.add(sig); - - // Initialize module parameters - resetParams(); - } - - /** - * Sets the value of the doctype string, assumed to have been forced - * to upper case. This is set only when the HTML module invokes the - * XML module for an XHTML document. 
- */ - public void setXhtmlDoctype(String doctype) { - _xhtmlDoctype = doctype; - if (_textMD != null) { - _textMD.setMarkup_language(_xhtmlDoctype); - } - } - - /** - * Reset parameter settings. - * Returns to a default state without any parameters. - */ - @Override - public void resetParams() { - _baseURL = null; - _localSchemas = new HashMap<>(); - _parseFromSig = false; - _sigWantsDecl = false; - _withTextMD = false; - } - - /** - * Parse configuration parameters for the module. - * - * If the parameter starts with "schema=", then the part to the - * right of the equals sign specifies a schema location URI - * followed by a path to a local copy of that schema to be used - * in its place, separated by a semicolon. Example: - * - * schema=http://example.com/schema.xsd;/schemas/example.xsd - * - * If the first character is "s" or "S", and the parameter isn't - * "schema", then XML document declarations are required for - * signature checks. - * - * If the parameter begins with "b" or "B", then the remainder of - * the parameter is used as a base URL for relative URIs. Otherwise - * it is ignored and there is no base URL. Example: - * - * bhttp://example.com/schemas/ - * - * If the parameter is "withtextmd=true", then textMD metadata is - * included in the JHOVE report. - * - * @param param - * the module parameter to parse. 
- */ - @Override - public void param(String param) { - if (param != null) { - param = param.trim(); - String lowerCaseParam = param.toLowerCase(); - if (lowerCaseParam.startsWith("schema=")) { - addLocalSchema(param); - } else if (lowerCaseParam.startsWith("s")) { - _sigWantsDecl = true; - } else if (lowerCaseParam.startsWith("b")) { - _baseURL = param.substring(1); - } else if (lowerCaseParam.equals("withtextmd=true")) { - _withTextMD = true; - } else { - _logger.warning("Ignoring unrecognized module parameter \"" - + param + "\""); - } - } - } - - /** - * Parse the content of a purported XML digital object and store the - * results in RepInfo. - * - * This is designed to be called in two passes. On the first pass, - * a non-validating parse is done. If this succeeds, and the presence - * of DTDs or schemas is detected, then parse returns 1 so that it - * will be called again to do a validating parse. If there is nothing - * to validate, we consider it "valid." - * - * @param stream - * An InputStream, positioned at its beginning, - * which is generated from the object to be parsed. - * If multiple calls to parse are made - * on the basis of a nonzero value being returned, - * a new InputStream must be provided each time. - * - * @param info - * A fresh (on the first call) RepInfo object - * which will be modified - * to reflect the results of the parsing - * If multiple calls to parse are made - * on the basis of a nonzero value being returned, - * the same RepInfo object should be passed with each - * call. - * - * @param parseIndex - * Must be 0 in first call to parse. If - * parse returns a nonzero value, it must be - * called again with parseIndex - * equal to that return value. 
- */ - @Override - public int parse(InputStream stream, RepInfo info, int parseIndex) { - - boolean canValidate = true; - super.initParse(); - info.setFormat(_format[0]); - info.setMimeType(_mimeType[0]); - info.setModule(this); - - if (_textMD == null || parseIndex == 0) { - _textMD = new TextMDMetadata(); - _xhtmlDoctype = null; - } - - // Setup the data stream, will determine if we use checksum stream - setupDataStream(stream, info); - - _propList = new LinkedList<>(); - _metadata = new Property("XMLMetadata", PropertyType.PROPERTY, - PropertyArity.LIST, _propList); - - XMLReader parser = null; - InputSource src = null; - XmlModuleHandler handler = null; - XmlLexicalHandler lexHandler = new XmlLexicalHandler(); - XmlDeclHandler declHandler = new XmlDeclHandler(); - - // The XmlDeclStream filters the characters, looking for an - // XML declaration, since there's no way to get that info - // out of SAX. - XmlDeclStream xds = new XmlDeclStream(_dstream); - try { - // Create an InputSource to feed the parser. - // If a SAX class was specified, use it, otherwise use - // the default parser. - src = new InputSource(xds); - // To correctly resolve relative URIs in XML, we need to know the - // XML document's system identifier, i.e. its location, in order - // to derive the base URI other URIs should be relative to. - // Unfortunately JHOVE doesn't currently provide such information - // to its modules. In lieu of that, this module has a parameter - // which can be set to be used as the base URI for all relative - // URI resolution in the document being parsed. 
- if (_baseURL != null) { - src.setSystemId(new File(_baseURL).toURI().toURL().toString()); - } - String saxClass = _je.getSaxClass(); - if (saxClass == null) { - SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setNamespaceAware(true); - parser = factory.newSAXParser().getXMLReader(); - } else { - parser = XMLReaderFactory.createXMLReader(saxClass); - } - handler = new XmlModuleHandler(); - handler.setXhtmlFlag(_xhtmlDoctype != null); - handler.setLocalSchemas(_localSchemas); - parser.setContentHandler(handler); - parser.setErrorHandler(handler); - parser.setEntityResolver(handler); - parser.setDTDHandler(handler); - try { - parser.setProperty( - "http://xml.org/sax/properties/lexical-handler", - lexHandler); - } catch (SAXException e) { - info.setMessage(new InfoMessage(MessageConstants.XML_HUL_5)); - } - try { - parser.setProperty( - "http://xml.org/sax/properties/declaration-handler", - declHandler); - } catch (SAXException e) { - info.setMessage(new InfoMessage(MessageConstants.XML_HUL_6)); - } - - } catch (Exception f) { + private static final String NAME = "XML-hul"; + private static final String RELEASE = "1.5.4"; + private static final int[] DATE = { 2024, 03, 05 }; + private static final String[] FORMAT = { "XML", "XHTML" }; + private static final String COVERAGE = "XML 1.0"; + private static final String[] MIMETYPE = { "text/xml", "application/xml", + "text/html" }; + private static final String WELLFORMED = "An XML file is well-formed if " + + "it meets the criteria defined in Section 2.1 of the XML " + + "specification (W3C Recommendation, 3rd edition, 2004-02-04)"; + private static final String VALIDITY = "An XML file is valid if " + + "well-formed, and the file has an associated DTD or XML Schema and " + + "the file meets the constraints defined by that DTD or Schema"; + private static final String REPINFO = "Additional representation " + + "information includes: version, endcoding, standalone flag, DTD or " + + "schema, 
namespaces, notations, character references, entities, " + + "processing instructions, and comments"; + private static final String NOTE = "This module determines " + + "well-formedness and validity using the SAX2-conforming parser " + + "specified by the invoking application"; + private static final String RIGHTS = "Copyright 2004-2007 by JSTOR and " + + "the President and Fellows of Harvard College. " + + "Released under the GNU Lesser General Public License."; + + /** Top-level property list. */ + protected List _propList; + + /** Top-level property. */ + protected Property _metadata; + + /** Doctype for XHTML documents only, otherwise null. */ + protected String _xhtmlDoctype; + + /** Base URL for DTDs. If null, all DTD URLs are absolute. */ + protected String _baseURL; + + /** + * Flag to control signature checking behavior. If true, + * checkSignatures insists on an XML document declaration; if + * false, it will parse the file if there is no document + * declaration. + */ + protected boolean _sigWantsDecl; + + /** + * Flag to indicate we're invoking the parser from checkSignatures. + * When true, it's up to checkSignatures to mark a signature as present. + */ + protected boolean _parseFromSig; + + /** Flag to indicate if TextMD metadata should be reported. */ + protected boolean _withTextMD; + + /** TextMD metadata for the file being processed. */ + protected TextMDMetadata _textMD; + + /** Map of URLs to locally stored schemas. */ + protected Map _localSchemas; + + /** + * Class constructor. + * + * Instantiate an XmlModule object. 
+ */ + public XmlModule() { + super(NAME, RELEASE, DATE, FORMAT, COVERAGE, MIMETYPE, WELLFORMED, + VALIDITY, REPINFO, NOTE, RIGHTS, false); + + _vendor = Agent.harvardInstance(); + + Document doc = new Document( + "Extensible Markup Language (XML) 1.0 (Third Edition)", + DocumentType.REPORT); + doc.setPublisher(Agent.newW3CInstance()); + doc.setDate("2004-02-04"); + doc.setIdentifier(new Identifier("http://www.w3.org/TR/REC-xml", + IdentifierType.URL)); + _specification.add(doc); + + doc = new Document("SAX", DocumentType.WEB); + doc.setIdentifier(new Identifier("http://sax.sourceforge.net/", + IdentifierType.URL)); + _specification.add(doc); + + Signature sig = new ExternalSignature(".xml", SignatureType.EXTENSION, + SignatureUseType.OPTIONAL); + _signature.add(sig); + + // Initialize module parameters + resetParams(); + } + + /** + * Sets the value of the doctype string, assumed to have been forced + * to upper case. This is set only when the HTML module invokes the + * XML module for an XHTML document. + */ + public void setXhtmlDoctype(String doctype) { + _xhtmlDoctype = doctype; + if (_textMD != null) { + _textMD.setMarkup_language(_xhtmlDoctype); + } + } + + /** + * Reset parameter settings. + * Returns to a default state without any parameters. + */ + @Override + public void resetParams() { + _baseURL = null; + _localSchemas = new HashMap<>(); + _parseFromSig = false; + _sigWantsDecl = false; + _withTextMD = false; + } + + /** + * Parse configuration parameters for the module. + * + * If the parameter starts with "schema=", then the part to the + * right of the equals sign specifies a schema location URI + * followed by a path to a local copy of that schema to be used + * in its place, separated by a semicolon. Example: + * + * schema=http://example.com/schema.xsd;/schemas/example.xsd + * + * If the first character is "s" or "S", and the parameter isn't + * "schema", then XML document declarations are required for + * signature checks. 
+ * + * If the parameter begins with "b" or "B", then the remainder of + * the parameter is used as a base URL for relative URIs. Otherwise + * it is ignored and there is no base URL. Example: + * + * bhttp://example.com/schemas/ + * + * If the parameter is "withtextmd=true", then textMD metadata is + * included in the JHOVE report. + * + * @param param + * the module parameter to parse. + */ + @Override + public void param(String param) { + if (param != null) { + param = param.trim(); + String lowerCaseParam = param.toLowerCase(); + if (lowerCaseParam.startsWith("schema=")) { + addLocalSchema(param); + } else if (lowerCaseParam.startsWith("s")) { + _sigWantsDecl = true; + } else if (lowerCaseParam.startsWith("b")) { + _baseURL = param.substring(1); + } else if (lowerCaseParam.equals("withtextmd=true")) { + _withTextMD = true; + } else { + _logger.warning("Ignoring unrecognized module parameter \"" + + param + "\""); + } + } + } + + /** + * Parse the content of a purported XML digital object and store the + * results in RepInfo. + * + * This is designed to be called in two passes. On the first pass, + * a non-validating parse is done. If this succeeds, and the presence + * of DTDs or schemas is detected, then parse returns 1 so that it + * will be called again to do a validating parse. If there is nothing + * to validate, we consider it "valid." + * + * @param stream + * An InputStream, positioned at its beginning, + * which is generated from the object to be parsed. + * If multiple calls to parse are made + * on the basis of a nonzero value being returned, + * a new InputStream must be provided each time. + * + * @param info + * A fresh (on the first call) RepInfo object + * which will be modified + * to reflect the results of the parsing + * If multiple calls to parse are made + * on the basis of a nonzero value being returned, + * the same RepInfo object should be passed with each + * call. + * + * @param parseIndex + * Must be 0 in first call to parse. 
If + * parse returns a nonzero value, it must be + * called again with parseIndex + * equal to that return value. + */ + @Override + public int parse(InputStream stream, RepInfo info, int parseIndex) { + + boolean canValidate = true; + super.initParse(); + info.setFormat(_format[0]); + info.setMimeType(_mimeType[0]); + info.setModule(this); + + if (_textMD == null || parseIndex == 0) { + _textMD = new TextMDMetadata(); + _xhtmlDoctype = null; + } + + // Setup the data stream, will determine if we use checksum stream + setupDataStream(stream, info); + + _propList = new LinkedList<>(); + _metadata = new Property("XMLMetadata", PropertyType.PROPERTY, + PropertyArity.LIST, _propList); + + XMLReader parser = null; + InputSource src = null; + XmlModuleHandler handler = null; + XmlLexicalHandler lexHandler = new XmlLexicalHandler(); + XmlDeclHandler declHandler = new XmlDeclHandler(); + + // The XmlDeclStream filters the characters, looking for an + // XML declaration, since there's no way to get that info + // out of SAX. + XmlDeclStream xds = new XmlDeclStream(_dstream); + try { + // Create an InputSource to feed the parser. + // If a SAX class was specified, use it, otherwise use + // the default parser. + src = new InputSource(xds); + // To correctly resolve relative URIs in XML, we need to know the + // XML document's system identifier, i.e. its location, in order + // to derive the base URI other URIs should be relative to. + // Unfortunately JHOVE doesn't currently provide such information + // to its modules. In lieu of that, this module has a parameter + // which can be set to be used as the base URI for all relative + // URI resolution in the document being parsed. 
+ if (_baseURL != null) { + src.setSystemId(new File(_baseURL).toURI().toURL().toString()); + } + String saxClass = _je.getSaxClass(); + if (saxClass == null) { + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + parser = factory.newSAXParser().getXMLReader(); + } else { + parser = XMLReaderFactory.createXMLReader(saxClass); + } + handler = new XmlModuleHandler(); + handler.setXhtmlFlag(_xhtmlDoctype != null); + handler.setLocalSchemas(_localSchemas); + parser.setContentHandler(handler); + parser.setErrorHandler(handler); + parser.setEntityResolver(handler); + parser.setDTDHandler(handler); + try { + parser.setProperty( + "http://xml.org/sax/properties/lexical-handler", + lexHandler); + } catch (SAXException e) { + info.setMessage(new InfoMessage(MessageConstants.XML_HUL_5)); + } + try { + parser.setProperty( + "http://xml.org/sax/properties/declaration-handler", + declHandler); + } catch (SAXException e) { + info.setMessage(new InfoMessage(MessageConstants.XML_HUL_6)); + } + + } catch (Exception f) { info.setMessage(new ErrorMessage(CoreMessageConstants.JHOVE_CORE_5, f.getMessage())); - info.setWellFormed(false); // actually not the file's fault - return 0; - } - try { - // On the first pass, we parse without validation. - parser.setFeature("http://xml.org/sax/features/validation", - parseIndex != 0); - } catch (SAXException se) { - if (parseIndex != 0) { - info.setMessage(new InfoMessage(MessageConstants.XML_HUL_8)); - } - canValidate = false; - } - try { - parser.setFeature("http://xml.org/sax/features/namespaces", true); - } catch (SAXException se) { - info.setMessage(new InfoMessage(MessageConstants.XML_HUL_7)); - } - // This property for supporting schemas is a JAXP 1.2 - // recommendation, not likely to be supported widely as - // of this (February 2004) writing, and not supported in - // standard Crimson. 
But it looks like the way to prepare - // for schema validation in the future, and at least the - // info message will tell users why they're getting bogus - // invalid status. - - // Try 2 different ways of setting schema validation; - // it appears that no one way works for all parsers. - if (parseIndex > 0) { - try { - parser.setFeature( - "http://apache.org/xml/features/validation/schema", - true); - } catch (SAXException ee) { - try { - parser.setProperty( - "http://java.sun.com/xml/jaxp/properties/schemaLanguage", - "http://www.w3.org/2001/XMLSchema"); - } catch (SAXException e) { - info.setMessage( - new InfoMessage(MessageConstants.XML_HUL_9)); - } - } - } - try { - parser.parse(src); - } catch (FileNotFoundException fnfe) { - // Make this particular exception a little more user-friendly - info.setMessage(new ErrorMessage(MessageConstants.XML_HUL_10, - fnfe.getMessage())); - info.setWellFormed(false); - return 0; - } catch (UTFDataFormatException udfe) { - if (handler.getSigFlag() && !_parseFromSig) { - info.setSigMatch(_name); - } - info.setMessage(new ErrorMessage(MessageConstants.XML_HUL_11)); - info.setWellFormed(false); - return 0; - } catch (IOException ioe) { - // We may get an IOException from trying to resolve an - // external entity. - if (handler.getSigFlag() && !_parseFromSig) { - info.setSigMatch(_name); - } - String mess = ioe.getClass().getName() + ": " + ioe.getMessage(); - info.setMessage(new ErrorMessage( + info.setWellFormed(false); // actually not the file's fault + return 0; + } + try { + // On the first pass, we parse without validation. 
+ parser.setFeature("http://xml.org/sax/features/validation", + parseIndex != 0); + } catch (SAXException se) { + if (parseIndex != 0) { + info.setMessage(new InfoMessage(MessageConstants.XML_HUL_8)); + } + canValidate = false; + } + try { + parser.setFeature("http://xml.org/sax/features/namespaces", true); + } catch (SAXException se) { + info.setMessage(new InfoMessage(MessageConstants.XML_HUL_7)); + } + // This property for supporting schemas is a JAXP 1.2 + // recommendation, not likely to be supported widely as + // of this (February 2004) writing, and not supported in + // standard Crimson. But it looks like the way to prepare + // for schema validation in the future, and at least the + // info message will tell users why they're getting bogus + // invalid status. + + // Try 2 different ways of setting schema validation; + // it appears that no one way works for all parsers. + if (parseIndex > 0) { + try { + parser.setFeature( + "http://apache.org/xml/features/validation/schema", + true); + } catch (SAXException ee) { + try { + parser.setProperty( + "http://java.sun.com/xml/jaxp/properties/schemaLanguage", + "http://www.w3.org/2001/XMLSchema"); + } catch (SAXException e) { + info.setMessage( + new InfoMessage(MessageConstants.XML_HUL_9)); + } + } + } + try { + parser.parse(src); + } catch (FileNotFoundException fnfe) { + // Make this particular exception a little more user-friendly + info.setMessage(new ErrorMessage(MessageConstants.XML_HUL_10, + fnfe.getMessage())); + info.setWellFormed(false); + return 0; + } catch (UTFDataFormatException udfe) { + if (handler.getSigFlag() && !_parseFromSig) { + info.setSigMatch(_name); + } + info.setMessage(new ErrorMessage(MessageConstants.XML_HUL_11)); + info.setWellFormed(false); + return 0; + } catch (IOException ioe) { + // We may get an IOException from trying to resolve an + // external entity. 
+ if (handler.getSigFlag() && !_parseFromSig) { + info.setSigMatch(_name); + } + String mess = ioe.getClass().getName() + ": " + ioe.getMessage(); + info.setMessage(new ErrorMessage( CoreMessageConstants.JHOVE_CORE_2, mess)); - info.setWellFormed(false); - return 0; - } catch (SAXParseException spe) { - // Document failed to parse. - if (handler.getSigFlag() && !_parseFromSig) { - info.setSigMatch(_name); - } - info.setMessage(new ErrorMessage( - MessageConstants.XML_HUL_1, - MessageFormat.format( - MessageConstants.XML_HUL_1_SUB.getMessage(), - spe.getMessage(), - spe.getLineNumber(), - spe.getColumnNumber()))); - info.setWellFormed(false); - return 0; - } catch (SAXException se) { - // Other SAX error. - if (handler.getSigFlag()) { - info.setSigMatch(_name); - } - // Sometimes the message will be null and another message - // wrapped inside it. Try to report that. - JhoveMessage message = JhoveMessages.getMessageInstance( - MessageConstants.XML_HUL_3.getId(), - MessageFormat.format( - MessageConstants.XML_HUL_3.getMessage(), - se.getMessage() != null ? se.getMessage() : "")); - Throwable ee = se.getCause(); - String subMess = (ee != null) - ? MessageFormat.format( - MessageConstants.XML_HUL_12.getMessage(), - ee.getClass().getName()) - : MessageConstants.XML_HUL_13.getMessage(); - info.setMessage(new ErrorMessage(message, subMess)); - info.setWellFormed(false); - return 0; - } - - // Check if user has aborted - if (_je.getAbort()) { - return 0; - } - - if (handler.getSigFlag() && parseIndex == 0) { - info.setSigMatch(_name); - } - // If it's the first pass, check if we found a DTD - // or schema. If so, re-parse with validation enabled. 
- String dtdURI = handler.getDTDURI(); - List schemaList = handler.getSchemas(); - - // To find the "primary" markup language we check the following, - // in order of preference: - // 1) the first schema's namespace URI - // 2) the first schema's location URI - // 3) the DTD's URI - // It should be noted that later on, when we check the namespace - // of the root element, if it has an associated URI, that will - // be used instead. - boolean hasRootSchema = false; - if (!schemaList.isEmpty()) { - SchemaInfo schItems = schemaList.get(0); - if (isNotEmpty(schItems.namespaceURI)) { - _textMD.setMarkup_language(schItems.namespaceURI); - } else if (isNotEmpty(schItems.location)) { - _textMD.setMarkup_language(schItems.location); - } - - if (isNotEmpty(schItems.location) - || (isNotEmpty(schItems.namespaceURI) && _localSchemas.containsKey(schItems.namespaceURI))) { - hasRootSchema = true; - } - } else if (isNotEmpty(dtdURI)) { - _textMD.setMarkup_language(dtdURI); - hasRootSchema = true; - } - - if (parseIndex == 0) { - if (canValidate && hasRootSchema) { - return 1; - } - info.setValid(RepInfo.UNDETERMINED); - // This may get downgraded to false, but won't - // be upgraded to true. - } - - // Take a deep breath. We parsed it. Now assemble the properties. - info.setProperty(_metadata); - - // If it's XHTML, add the HTML property. - HtmlMetadata hMetadata = handler.getHtmlMetadata(); - if (hMetadata != null) { - info.setProperty( - hMetadata.toProperty(_withTextMD ? _textMD : null)); - } - - // Report the parser in a property. - _propList.add(new Property("Parser", PropertyType.STRING, - parser.getClass().getName())); - - // Add the version property. Give precedence to XHTML doctype. 
- String vers = null; - if (_xhtmlDoctype != null) { - vers = DTDMapper.getXHTMLVersion(_xhtmlDoctype); - _textMD.setMarkup_language_version(vers); - } - if (vers != null) { - info.setVersion(vers); - } else { - vers = xds.getVersion(); - if (vers != null) { - info.setVersion(vers); - } - } - _textMD.setMarkup_basis_version(vers); - - // Add the encoding property. - String encoding = xds.getEncoding(); - if (encoding == null) { - // If no explicit encoding, use default (Bugzilla 136) - encoding = "UTF-8"; - } - _propList.add(new Property("Encoding", PropertyType.STRING, encoding)); - - _textMD.setCharset(encoding); - _textMD.setByte_size("8"); - _textMD.setByte_order(_bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE); - _textMD.setCharacter_size(_textMD.getCharset().contains("UTF") ? "variable" : "1"); - - // CRLF from XmlDeclStream ... - String lineEnd = xds.getKindOfLineEnd(); - if (lineEnd == null) { - info.setMessage(new InfoMessage(MessageConstants.XML_HUL_4)); - _textMD.setLinebreak(TextMDMetadata.NILL); - } else if ("CR".equalsIgnoreCase(lineEnd)) { - _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CR); - } else if ("LF".equalsIgnoreCase(lineEnd)) { - _textMD.setLinebreak(TextMDMetadata.LINEBREAK_LF); - } else if ("CRLF".equalsIgnoreCase(lineEnd)) { - _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CRLF); - } - - // Add the standalone property. - String sa = xds.getStandalone(); - if (sa != null) { - _propList.add(new Property("Standalone", PropertyType.STRING, sa)); - } - - // Add the DTD property. - if (dtdURI != null) { - _propList.add(new Property("DTD_URI", PropertyType.STRING, dtdURI)); - } - - if (!schemaList.isEmpty()) { - // Build a List of Properties, which will be the value - // of the Schemas Property. - List schemaPropList = new ArrayList<>(schemaList.size()); - // Iterate through all the schemas. 
- for (SchemaInfo schema : schemaList) { - // Build a Property (Schema) whose value is an array - // of two Properties (NamespaceURI and SchemaLocation). - Property[] schItemProps = new Property[2]; - schItemProps[0] = new Property("NamespaceURI", - PropertyType.STRING, schema.namespaceURI); - schItemProps[1] = new Property("SchemaLocation", - PropertyType.STRING, schema.location); - schemaPropList.add(new Property("Schema", - PropertyType.PROPERTY, - PropertyArity.ARRAY, - schItemProps)); - } - // Now add the list to the metadata - _propList.add(new Property("Schemas", - PropertyType.PROPERTY, - PropertyArity.LIST, - schemaPropList)); - } - - // Add the root element. - String root = handler.getRoot(); - String rootPrefix = null; - if (root != null) { - _propList.add(new Property("Root", PropertyType.STRING, root)); - if ("html".equals(root)) { - // Specify format as XHTML - info.setFormat(_format[1]); - // Set the version according to the doctype... how? - - } - // Get the prefix of root - int indexOfColon = root.indexOf(':'); - if (indexOfColon != -1) { - rootPrefix = root.substring(0, indexOfColon); - } - } - if (rootPrefix == null) { - rootPrefix = ""; - } - - // Declare properties we're going to add. They have - // some odd interdependencies, so we create them all - // and then add them in the right (specified) order. 
- Property namespaceProp = null; - Property notationsProp = null; - Property charRefsProp = null; - Property entitiesProp = null; - Property procInstProp = null; - Property commentProp = null; - Property unicodeBlocksProp = null; - - Map ns = handler.getNamespaces(); - if (!ns.isEmpty()) { - Set keys = ns.keySet(); - List nsList = new ArrayList<>(keys.size()); - for (String key : keys) { - String val = ns.get(key); - Property[] supPropArr = new Property[2]; - supPropArr[0] = new Property("Prefix", - PropertyType.STRING, key); - supPropArr[1] = new Property("URI", - PropertyType.STRING, val); - Property onens = new Property("Namespace", - PropertyType.PROPERTY, - PropertyArity.ARRAY, - supPropArr); - nsList.add(onens); - - // Try to find the namespace URI of root - if (rootPrefix.equalsIgnoreCase(key) && isNotEmpty(val)) { - _textMD.setMarkup_language(val); - } - } - namespaceProp = new Property("Namespaces", - PropertyType.PROPERTY, - PropertyArity.LIST, - nsList); - } - - // CharacterReferences property goes here. - // Report as a list of 4-digit hexadecimal strings, - // e.g., 003C, 04AA, etc. - // Also build the Unicode blocks here. 
- List refs = xds.getCharacterReferences(); - if (!refs.isEmpty()) { - Utf8BlockMarker utf8BM = new Utf8BlockMarker(); - List refList = new ArrayList<>(refs.size()); - for (Integer refint : refs) { - refList.add(intTo4DigitHex(refint)); - utf8BM.markBlock(refint); - } - charRefsProp = new Property("CharacterReferences", - PropertyType.STRING, - PropertyArity.LIST, - refList); - unicodeBlocksProp = utf8BM - .getBlocksUsedProperty("UnicodeCharRefBlocks"); - } - - // Entities property - // External unparsed entities - Set entNames = lexHandler.getEntityNames(); - Set attributeVals = handler.getAttributeValues(); - List entProps = new LinkedList<>(); - List uent = handler.getUnparsedEntities(); - List unparsedNotationNames = new LinkedList<>(); - if (!uent.isEmpty()) { - for (String[] entarr : uent) { - // We check external parsed entities against - // the list of attribute values which we've - // accumulated. If a parsed entity name matches an - // attribute value, we assume it's used. - String name = entarr[0]; - if (attributeVals.contains(name)) { - // Add the notation name to the list - // unparsedNotationNames, so we can use it - // in determining which notations are used. 
- unparsedNotationNames.add(entarr[3]); - List subPropList = new ArrayList<>(6); - subPropList.add(new Property("Name", - PropertyType.STRING, name)); - subPropList.add(new Property("Type", - PropertyType.STRING, "External unparsed")); - subPropList.add(new Property("PublicID", - PropertyType.STRING, entarr[1])); - subPropList.add(new Property("SystemID", - PropertyType.STRING, entarr[2])); - subPropList.add(new Property("NotationName", - PropertyType.STRING, entarr[3])); - - entProps.add(new Property("Entity", - PropertyType.PROPERTY, - PropertyArity.LIST, - subPropList)); - } - } - } - - // Internal entities - List declEnts = declHandler.getInternalEntityDeclarations(); - if (!declEnts.isEmpty()) { - for (String[] entarr : declEnts) { - String name = entarr[0]; - // include only if the entity was actually used - if (entNames.contains(name)) { - List subPropList = new ArrayList<>(4); - subPropList.add(new Property("Name", - PropertyType.STRING, name)); - subPropList.add(new Property("Type", - PropertyType.STRING, "Internal")); - subPropList.add(new Property("Value", - PropertyType.STRING, entarr[1])); - entProps.add(new Property("Entity", - PropertyType.PROPERTY, - PropertyArity.LIST, - subPropList)); - } - } - } - - // External parsed entities - declEnts = declHandler.getExternalEntityDeclarations(); - if (!declEnts.isEmpty()) { - for (String[] entarr : declEnts) { - String name = entarr[0]; - // include only if the entity was actually used - if (entNames.contains(name)) { - List subPropList = new ArrayList<>(4); - subPropList.add(new Property("Name", - PropertyType.STRING, name)); - subPropList.add(new Property("Type", - PropertyType.STRING, "External parsed")); - if (entarr[1] != null) { - subPropList.add(new Property("PublicID", - PropertyType.STRING, entarr[1])); - } - if (entarr[2] != null) { - subPropList.add(new Property("SystemID", - PropertyType.STRING, entarr[2])); - } - - entProps.add(new Property("Entity", - PropertyType.PROPERTY, - 
PropertyArity.LIST, - subPropList)); - } - } - } - - if (!entProps.isEmpty()) { - entitiesProp = new Property("Entities", - PropertyType.PROPERTY, - PropertyArity.LIST, - entProps); - } - - List pi = handler - .getProcessingInstructions(); - List piTargets = new LinkedList<>(); - if (!pi.isEmpty()) { - // Build a property, which consists of a list - // of properties, each of which is an array of - // two String properties, named Target and - // Data respectively. - List piPropList = new ArrayList<>(pi.size()); - for (ProcessingInstructionInfo pistr : pi) { - Property[] subPropArr = new Property[2]; - // Accumulate targets in a list, so we can tell - // which Notations use them. - // Wait a minute -- what we're doing here can't work!! - // TODO: What's supposed to be happening? - // piTargets.add (subPropArr[0]); - subPropArr[0] = new Property("Target", - PropertyType.STRING, pistr.target); - subPropArr[1] = new Property("Data", - PropertyType.STRING, pistr.data); - piPropList.add(new Property("ProcessingInstruction", - PropertyType.PROPERTY, - PropertyArity.ARRAY, - subPropArr)); - } - procInstProp = new Property("ProcessingInstructions", - PropertyType.PROPERTY, - PropertyArity.LIST, - piPropList); - } - - // Notations property. We list notations only if they're - // "actually used," meaning that they designate either - // the target of a processing instruction or the ndata - // of an unparsed entry which is itself "actually used." - List notations = handler.getNotations(); - if (!notations.isEmpty()) { - List notProps = new ArrayList<>(notations.size()); - ListIterator iter = notations.listIterator(); - List subPropList = new ArrayList<>(3); - while (iter.hasNext()) { - String[] notArray = iter.next(); - String notName = notArray[0]; - // Check for use of Notation before including - // TODO this is implemented wrong! 
Need to reinvestigate - if (piTargets.contains(notName) - || unparsedNotationNames.contains(notName)) { - // notArray has name, public ID, system ID - subPropList.add(new Property("Name", - PropertyType.STRING, notName)); - if (notArray[1] != null) { - subPropList.add(new Property("PublicID", - PropertyType.STRING, notArray[1])); - } - if (notArray[2] != null) { - subPropList.add(new Property("SystemID", - PropertyType.STRING, notArray[2])); - } - notProps.add(new Property("Notation", - PropertyType.PROPERTY, - PropertyArity.LIST, - subPropList)); - } - } - // Recheck emptiness in case only unprocessed notations were found - if (!notProps.isEmpty()) { - notationsProp = new Property("Notations", - PropertyType.PROPERTY, - PropertyArity.LIST, - notProps); - } - } - - // Now add all the properties we created. - if (namespaceProp != null) { - _propList.add(namespaceProp); - } - if (notationsProp != null) { - _propList.add(notationsProp); - } - if (charRefsProp != null) { - _propList.add(charRefsProp); - } - if (unicodeBlocksProp != null) { - _propList.add(unicodeBlocksProp); - } - if (entitiesProp != null) { - _propList.add(entitiesProp); - } - if (procInstProp != null) { - _propList.add(procInstProp); - } - - List comm = lexHandler.getComments(); - if (!comm.isEmpty()) { - commentProp = new Property("Comments", - PropertyType.STRING, - PropertyArity.LIST, - comm); - } - if (commentProp != null) { - _propList.add(commentProp); - } - - // Check if parse detected invalid XML - if (!handler.isValid()) { - info.setValid(false); - } - - if (info.getWellFormed() == RepInfo.TRUE) { - if (_xhtmlDoctype != null) { - info.setMimeType(_mimeType[2]); - } else { - info.setMimeType(_mimeType[0]); - } - } - - // Add any messages from the parse. 
- List msgs = handler.getMessages(); - for (Message msg : msgs) { - info.setMessage(msg); - } - - if (info.getVersion() == null) { - info.setVersion("1.0"); - } - - if (_withTextMD) { - _textMD.setMarkup_basis(info.getFormat()); - _textMD.setMarkup_basis_version(info.getVersion()); - Property property = new Property("TextMDMetadata", - PropertyType.TEXTMDMETADATA, - PropertyArity.SCALAR, _textMD); - _propList.add(property); - } - - // Set the checksums in the report if they're calculated - setChecksums(this._ckSummer, info); - - return 0; - } - - /** - * Check if the digital object conforms to this Module's - * internal signature information. - * - * XML is a particularly messy case; in general, there's no - * even moderately good way to check "signatures" without parsing - * the whole file, since the document declaration is optional. - * We provide the user two choices, based on the "s" parameter. - * If 's' is the first character of the module parameter, then - * we look for an XML document declaration, and say there's no - * signature if it's missing. (This can reject well-formed - * XML files, though not valid ones.) Otherwise, if there's no - * document declaration, we parse the whole file. 
- * - * @param file - * A File object for the object being parsed - * @param stream - * An InputStream, positioned at its beginning, - * which is generated from the object to be parsed - * @param info - * A fresh RepInfo object which will be modified - * to reflect the results of the test - */ - @Override - public void checkSignatures(File file, InputStream stream, RepInfo info) - throws IOException { - _parseFromSig = false; - info.setFormat(_format[0]); - info.setMimeType(_mimeType[0]); - info.setModule(this); - String sigStr = "= sigStr.length()) { - info.setSigMatch(_name); - return; // sig matches - } - } else - break; - } - } catch (IOException e) { - info.setWellFormed(false); - return; - } - if (_sigWantsDecl) { - // No XML declaration, and it's mandatory according to the param. - info.setWellFormed(false); - return; - } - - // No XML signature, but we're allowed to parse the file now. - // This means rewinding back to the start of the file. - int parseIndex = 1; - _parseFromSig = true; // we set the sig match ourselves - while (parseIndex != 0) { - stream.close(); - stream = new FileInputStream(file); - parseIndex = parse(stream, info, parseIndex); - } - if (info.getWellFormed() == RepInfo.TRUE) { - info.setSigMatch(_name); - } - } - - /** - * Converts an int to a 4-digit hex value, e.g., - * 003F or F10A. This is used for Character References. - */ - protected static String intTo4DigitHex(int n) { - StringBuilder sb = new StringBuilder(4); - for (int i = 3; i >= 0; i--) { - int d = (n >> (4 * i)) & 0XF; // extract a nybble - if (d < 10) { - sb.append((char) ('0' + d)); - } else { - sb.append((char) ('A' + (d - 10))); - } - } - return sb.toString(); - } - - /** - * Check that a string contains something other than "[None]". 
- * - * @param value - * string to test - * @return - * true if the string contains something - * other than "[None]", false otherwise - */ - protected static boolean isNotEmpty(String value) { - return ((value != null) && (value.length() != 0) - && !("[None]".equals(value))); - } - - /** - * Parse a "schema" configuration argument and map the schema - * location URI to a local file after validating both components. - * - * @param param - * a module parameter string of the form - * "schema=[location-URI];[local-path]" - */ - private void addLocalSchema(String param) { - int eq = param.indexOf('='); - int semi = param.indexOf(';'); - try { - String uriParam = param.substring(eq + 1, semi); - String localParam = param.substring(semi + 1); - try { - String locationUri = new URI(uriParam).toString(); - File localFile = new File(localParam); - if (localFile.exists()) { - _localSchemas.put(locationUri, localFile); - } else { - _logger.warning("Ignoring module parameter with " - + "unresolvable path: \"" + localParam + "\""); - } - } catch (URISyntaxException use) { - _logger.warning("Ignoring module parameter with " - + "invalid URI syntax: \"" + uriParam + "\""); - } - } catch (IndexOutOfBoundsException ioobe) { - _logger.warning("Ignoring malformed module parameter \"" - + param + "\""); - } - } + info.setWellFormed(false); + return 0; + } catch (SAXParseException spe) { + // Document failed to parse. + if (handler.getSigFlag() && !_parseFromSig) { + info.setSigMatch(_name); + } + info.setMessage(new ErrorMessage( + MessageConstants.XML_HUL_1, + MessageFormat.format( + MessageConstants.XML_HUL_1_SUB.getMessage(), + spe.getMessage(), + spe.getLineNumber(), + spe.getColumnNumber()))); + info.setWellFormed(false); + return 0; + } catch (SAXException se) { + // Other SAX error. + if (handler.getSigFlag()) { + info.setSigMatch(_name); + } + // Sometimes the message will be null and another message + // wrapped inside it. Try to report that. 
+ JhoveMessage message = JhoveMessages.getMessageInstance( + MessageConstants.XML_HUL_3.getId(), + MessageFormat.format( + MessageConstants.XML_HUL_3.getMessage(), + se.getMessage() != null ? se.getMessage() : "")); + Throwable ee = se.getCause(); + String subMess = (ee != null) + ? MessageFormat.format( + MessageConstants.XML_HUL_12.getMessage(), + ee.getClass().getName()) + : MessageConstants.XML_HUL_13.getMessage(); + info.setMessage(new ErrorMessage(message, subMess)); + info.setWellFormed(false); + return 0; + } + + // Check if user has aborted + if (_je.getAbort()) { + return 0; + } + + if (handler.getSigFlag() && parseIndex == 0) { + info.setSigMatch(_name); + } + // If it's the first pass, check if we found a DTD + // or schema. If so, re-parse with validation enabled. + String dtdURI = handler.getDTDURI(); + List schemaList = handler.getSchemas(); + + // To find the "primary" markup language we check the following, + // in order of preference: + // 1) the first schema's namespace URI + // 2) the first schema's location URI + // 3) the DTD's URI + // It should be noted that later on, when we check the namespace + // of the root element, if it has an associated URI, that will + // be used instead. + boolean hasRootSchema = false; + if (!schemaList.isEmpty()) { + SchemaInfo schItems = schemaList.get(0); + if (isNotEmpty(schItems.namespaceURI)) { + _textMD.setMarkup_language(schItems.namespaceURI); + } else if (isNotEmpty(schItems.location)) { + _textMD.setMarkup_language(schItems.location); + } + + if (isNotEmpty(schItems.location) + || (isNotEmpty(schItems.namespaceURI) && _localSchemas.containsKey(schItems.namespaceURI))) { + hasRootSchema = true; + } + } else if (isNotEmpty(dtdURI)) { + _textMD.setMarkup_language(dtdURI); + hasRootSchema = true; + } + + if (parseIndex == 0) { + if (canValidate && hasRootSchema) { + return 1; + } + info.setValid(RepInfo.UNDETERMINED); + // This may get downgraded to false, but won't + // be upgraded to true. 
+ } + + // Take a deep breath. We parsed it. Now assemble the properties. + info.setProperty(_metadata); + + // If it's XHTML, add the HTML property. + HtmlMetadata hMetadata = handler.getHtmlMetadata(); + if (hMetadata != null) { + info.setProperty( + hMetadata.toProperty(_withTextMD ? _textMD : null)); + } + + // Report the parser in a property. + _propList.add(new Property("Parser", PropertyType.STRING, + parser.getClass().getName())); + + // Add the version property. Give precedence to XHTML doctype. + String vers = null; + if (_xhtmlDoctype != null) { + vers = DTDMapper.getXHTMLVersion(_xhtmlDoctype); + _textMD.setMarkup_language_version(vers); + } + if (vers != null) { + info.setVersion(vers); + } else { + vers = xds.getVersion(); + if (vers != null) { + info.setVersion(vers); + } + } + _textMD.setMarkup_basis_version(vers); + + // Add the encoding property. + String encoding = xds.getEncoding(); + if (encoding == null) { + // If no explicit encoding, use default (Bugzilla 136) + encoding = "UTF-8"; + } + _propList.add(new Property("Encoding", PropertyType.STRING, encoding)); + + _textMD.setCharset(encoding); + _textMD.setByte_size("8"); + _textMD.setByte_order(_bigEndian ? TextMDMetadata.BYTE_ORDER_BIG : TextMDMetadata.BYTE_ORDER_LITTLE); + _textMD.setCharacter_size(_textMD.getCharset().contains("UTF") ? "variable" : "1"); + + // CRLF from XmlDeclStream ... + String lineEnd = xds.getKindOfLineEnd(); + if (lineEnd == null) { + info.setMessage(new InfoMessage(MessageConstants.XML_HUL_4)); + _textMD.setLinebreak(TextMDMetadata.NILL); + } else if ("CR".equalsIgnoreCase(lineEnd)) { + _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CR); + } else if ("LF".equalsIgnoreCase(lineEnd)) { + _textMD.setLinebreak(TextMDMetadata.LINEBREAK_LF); + } else if ("CRLF".equalsIgnoreCase(lineEnd)) { + _textMD.setLinebreak(TextMDMetadata.LINEBREAK_CRLF); + } + + // Add the standalone property. 
+ String sa = xds.getStandalone(); + if (sa != null) { + _propList.add(new Property("Standalone", PropertyType.STRING, sa)); + } + + // Add the DTD property. + if (dtdURI != null) { + _propList.add(new Property("DTD_URI", PropertyType.STRING, dtdURI)); + } + + if (!schemaList.isEmpty()) { + // Build a List of Properties, which will be the value + // of the Schemas Property. + List schemaPropList = new ArrayList<>(schemaList.size()); + // Iterate through all the schemas. + for (SchemaInfo schema : schemaList) { + // Build a Property (Schema) whose value is an array + // of two Properties (NamespaceURI and SchemaLocation). + Property[] schItemProps = new Property[2]; + schItemProps[0] = new Property("NamespaceURI", + PropertyType.STRING, schema.namespaceURI); + schItemProps[1] = new Property("SchemaLocation", + PropertyType.STRING, schema.location); + schemaPropList.add(new Property("Schema", + PropertyType.PROPERTY, + PropertyArity.ARRAY, + schItemProps)); + } + // Now add the list to the metadata + _propList.add(new Property("Schemas", + PropertyType.PROPERTY, + PropertyArity.LIST, + schemaPropList)); + } + + // Add the root element. + String root = handler.getRoot(); + String rootPrefix = null; + if (root != null) { + _propList.add(new Property("Root", PropertyType.STRING, root)); + if ("html".equals(root)) { + // Specify format as XHTML + info.setFormat(_format[1]); + // Set the version according to the doctype... how? + + } + // Get the prefix of root + int indexOfColon = root.indexOf(':'); + if (indexOfColon != -1) { + rootPrefix = root.substring(0, indexOfColon); + } + } + if (rootPrefix == null) { + rootPrefix = ""; + } + + // Declare properties we're going to add. They have + // some odd interdependencies, so we create them all + // and then add them in the right (specified) order. 
+ Property namespaceProp = null; + Property notationsProp = null; + Property charRefsProp = null; + Property entitiesProp = null; + Property procInstProp = null; + Property commentProp = null; + Property unicodeBlocksProp = null; + + Map ns = handler.getNamespaces(); + if (!ns.isEmpty()) { + Set keys = ns.keySet(); + List nsList = new ArrayList<>(keys.size()); + for (String key : keys) { + String val = ns.get(key); + Property[] supPropArr = new Property[2]; + supPropArr[0] = new Property("Prefix", + PropertyType.STRING, key); + supPropArr[1] = new Property("URI", + PropertyType.STRING, val); + Property onens = new Property("Namespace", + PropertyType.PROPERTY, + PropertyArity.ARRAY, + supPropArr); + nsList.add(onens); + + // Try to find the namespace URI of root + if (rootPrefix.equalsIgnoreCase(key) && isNotEmpty(val)) { + _textMD.setMarkup_language(val); + } + } + namespaceProp = new Property("Namespaces", + PropertyType.PROPERTY, + PropertyArity.LIST, + nsList); + } + + // CharacterReferences property goes here. + // Report as a list of 4-digit hexadecimal strings, + // e.g., 003C, 04AA, etc. + // Also build the Unicode blocks here. 
+ List refs = xds.getCharacterReferences(); + if (!refs.isEmpty()) { + Utf8BlockMarker utf8BM = new Utf8BlockMarker(); + List refList = new ArrayList<>(refs.size()); + for (Integer refint : refs) { + refList.add(intTo4DigitHex(refint)); + utf8BM.markBlock(refint); + } + charRefsProp = new Property("CharacterReferences", + PropertyType.STRING, + PropertyArity.LIST, + refList); + unicodeBlocksProp = utf8BM + .getBlocksUsedProperty("UnicodeCharRefBlocks"); + } + + // Entities property + // External unparsed entities + Set entNames = lexHandler.getEntityNames(); + Set attributeVals = handler.getAttributeValues(); + List entProps = new LinkedList<>(); + List uent = handler.getUnparsedEntities(); + List unparsedNotationNames = new LinkedList<>(); + if (!uent.isEmpty()) { + for (String[] entarr : uent) { + // We check external parsed entities against + // the list of attribute values which we've + // accumulated. If a parsed entity name matches an + // attribute value, we assume it's used. + String name = entarr[0]; + if (attributeVals.contains(name)) { + // Add the notation name to the list + // unparsedNotationNames, so we can use it + // in determining which notations are used. 
+ unparsedNotationNames.add(entarr[3]); + List subPropList = new ArrayList<>(6); + subPropList.add(new Property("Name", + PropertyType.STRING, name)); + subPropList.add(new Property("Type", + PropertyType.STRING, "External unparsed")); + subPropList.add(new Property("PublicID", + PropertyType.STRING, entarr[1])); + subPropList.add(new Property("SystemID", + PropertyType.STRING, entarr[2])); + subPropList.add(new Property("NotationName", + PropertyType.STRING, entarr[3])); + + entProps.add(new Property("Entity", + PropertyType.PROPERTY, + PropertyArity.LIST, + subPropList)); + } + } + } + + // Internal entities + List declEnts = declHandler.getInternalEntityDeclarations(); + if (!declEnts.isEmpty()) { + for (String[] entarr : declEnts) { + String name = entarr[0]; + // include only if the entity was actually used + if (entNames.contains(name)) { + List subPropList = new ArrayList<>(4); + subPropList.add(new Property("Name", + PropertyType.STRING, name)); + subPropList.add(new Property("Type", + PropertyType.STRING, "Internal")); + subPropList.add(new Property("Value", + PropertyType.STRING, entarr[1])); + entProps.add(new Property("Entity", + PropertyType.PROPERTY, + PropertyArity.LIST, + subPropList)); + } + } + } + + // External parsed entities + declEnts = declHandler.getExternalEntityDeclarations(); + if (!declEnts.isEmpty()) { + for (String[] entarr : declEnts) { + String name = entarr[0]; + // include only if the entity was actually used + if (entNames.contains(name)) { + List subPropList = new ArrayList<>(4); + subPropList.add(new Property("Name", + PropertyType.STRING, name)); + subPropList.add(new Property("Type", + PropertyType.STRING, "External parsed")); + if (entarr[1] != null) { + subPropList.add(new Property("PublicID", + PropertyType.STRING, entarr[1])); + } + if (entarr[2] != null) { + subPropList.add(new Property("SystemID", + PropertyType.STRING, entarr[2])); + } + + entProps.add(new Property("Entity", + PropertyType.PROPERTY, + 
PropertyArity.LIST, + subPropList)); + } + } + } + + if (!entProps.isEmpty()) { + entitiesProp = new Property("Entities", + PropertyType.PROPERTY, + PropertyArity.LIST, + entProps); + } + + List pi = handler + .getProcessingInstructions(); + List piTargets = new LinkedList<>(); + if (!pi.isEmpty()) { + // Build a property, which consists of a list + // of properties, each of which is an array of + // two String properties, named Target and + // Data respectively. + List piPropList = new ArrayList<>(pi.size()); + for (ProcessingInstructionInfo pistr : pi) { + Property[] subPropArr = new Property[2]; + // Accumulate targets in a list, so we can tell + // which Notations use them. + // Wait a minute -- what we're doing here can't work!! + // TODO: What's supposed to be happening? + // piTargets.add (subPropArr[0]); + subPropArr[0] = new Property("Target", + PropertyType.STRING, pistr.target); + subPropArr[1] = new Property("Data", + PropertyType.STRING, pistr.data); + piPropList.add(new Property("ProcessingInstruction", + PropertyType.PROPERTY, + PropertyArity.ARRAY, + subPropArr)); + } + procInstProp = new Property("ProcessingInstructions", + PropertyType.PROPERTY, + PropertyArity.LIST, + piPropList); + } + + // Notations property. We list notations only if they're + // "actually used," meaning that they designate either + // the target of a processing instruction or the ndata + // of an unparsed entry which is itself "actually used." + List notations = handler.getNotations(); + if (!notations.isEmpty()) { + List notProps = new ArrayList<>(notations.size()); + ListIterator iter = notations.listIterator(); + List subPropList = new ArrayList<>(3); + while (iter.hasNext()) { + String[] notArray = iter.next(); + String notName = notArray[0]; + // Check for use of Notation before including + // TODO this is implemented wrong! 
Need to reinvestigate + if (piTargets.contains(notName) + || unparsedNotationNames.contains(notName)) { + // notArray has name, public ID, system ID + subPropList.add(new Property("Name", + PropertyType.STRING, notName)); + if (notArray[1] != null) { + subPropList.add(new Property("PublicID", + PropertyType.STRING, notArray[1])); + } + if (notArray[2] != null) { + subPropList.add(new Property("SystemID", + PropertyType.STRING, notArray[2])); + } + notProps.add(new Property("Notation", + PropertyType.PROPERTY, + PropertyArity.LIST, + subPropList)); + } + } + // Recheck emptiness in case only unprocessed notations were found + if (!notProps.isEmpty()) { + notationsProp = new Property("Notations", + PropertyType.PROPERTY, + PropertyArity.LIST, + notProps); + } + } + + // Now add all the properties we created. + if (namespaceProp != null) { + _propList.add(namespaceProp); + } + if (notationsProp != null) { + _propList.add(notationsProp); + } + if (charRefsProp != null) { + _propList.add(charRefsProp); + } + if (unicodeBlocksProp != null) { + _propList.add(unicodeBlocksProp); + } + if (entitiesProp != null) { + _propList.add(entitiesProp); + } + if (procInstProp != null) { + _propList.add(procInstProp); + } + + List comm = lexHandler.getComments(); + if (!comm.isEmpty()) { + commentProp = new Property("Comments", + PropertyType.STRING, + PropertyArity.LIST, + comm); + } + if (commentProp != null) { + _propList.add(commentProp); + } + + // Check if parse detected invalid XML + if (!handler.isValid()) { + info.setValid(false); + } + + if (info.getWellFormed() == RepInfo.TRUE) { + if (_xhtmlDoctype != null) { + info.setMimeType(_mimeType[2]); + } else { + info.setMimeType(_mimeType[0]); + } + } + + // Add any messages from the parse. 
+ List msgs = handler.getMessages(); + for (Message msg : msgs) { + info.setMessage(msg); + } + + if (info.getVersion() == null) { + info.setVersion("1.0"); + } + + if (_withTextMD) { + _textMD.setMarkup_basis(info.getFormat()); + _textMD.setMarkup_basis_version(info.getVersion()); + Property property = new Property("TextMDMetadata", + PropertyType.TEXTMDMETADATA, + PropertyArity.SCALAR, _textMD); + _propList.add(property); + } + + // Set the checksums in the report if they're calculated + setChecksums(this._ckSummer, info); + + return 0; + } + + /** + * Check if the digital object conforms to this Module's + * internal signature information. + * + * XML is a particularly messy case; in general, there's no + * even moderately good way to check "signatures" without parsing + * the whole file, since the document declaration is optional. + * We provide the user two choices, based on the "s" parameter. + * If 's' is the first character of the module parameter, then + * we look for an XML document declaration, and say there's no + * signature if it's missing. (This can reject well-formed + * XML files, though not valid ones.) Otherwise, if there's no + * document declaration, we parse the whole file. 
+ * + * @param file + * A File object for the object being parsed + * @param stream + * An InputStream, positioned at its beginning, + * which is generated from the object to be parsed + * @param info + * A fresh RepInfo object which will be modified + * to reflect the results of the test + */ + @Override + public void checkSignatures(File file, InputStream stream, RepInfo info) + throws IOException { + _parseFromSig = false; + info.setFormat(_format[0]); + info.setMimeType(_mimeType[0]); + info.setModule(this); + String sigStr = "= sigStr.length()) { + info.setSigMatch(_name); + return; // sig matches + } + } else + break; + } + } catch (IOException e) { + info.setWellFormed(false); + return; + } + if (_sigWantsDecl) { + // No XML declaration, and it's mandatory according to the param. + info.setWellFormed(false); + return; + } + + // No XML signature, but we're allowed to parse the file now. + // This means rewinding back to the start of the file. + int parseIndex = 1; + _parseFromSig = true; // we set the sig match ourselves + while (parseIndex != 0) { + stream.close(); + stream = new FileInputStream(file); + parseIndex = parse(stream, info, parseIndex); + } + if (info.getWellFormed() == RepInfo.TRUE) { + info.setSigMatch(_name); + } + } + + /** + * Converts an int to a 4-digit hex value, e.g., + * 003F or F10A. This is used for Character References. + */ + protected static String intTo4DigitHex(int n) { + StringBuilder sb = new StringBuilder(4); + for (int i = 3; i >= 0; i--) { + int d = (n >> (4 * i)) & 0XF; // extract a nybble + if (d < 10) { + sb.append((char) ('0' + d)); + } else { + sb.append((char) ('A' + (d - 10))); + } + } + return sb.toString(); + } + + /** + * Check that a string contains something other than "[None]". 
+ * + * @param value + * string to test + * @return + * true if the string contains something + * other than "[None]", false otherwise + */ + protected static boolean isNotEmpty(String value) { + return ((value != null) && (value.length() != 0) + && !("[None]".equals(value))); + } + + /** + * Parse a "schema" configuration argument and map the schema + * location URI to a local file after validating both components. + * + * @param param + * a module parameter string of the form + * "schema=[location-URI];[local-path]" + */ + private void addLocalSchema(String param) { + int eq = param.indexOf('='); + int semi = param.indexOf(';'); + try { + String uriParam = param.substring(eq + 1, semi); + String localParam = param.substring(semi + 1); + try { + String locationUri = new URI(uriParam).toString(); + File localFile = new File(localParam); + if (localFile.exists()) { + _localSchemas.put(locationUri, localFile); + } else { + _logger.warning("Ignoring module parameter with " + + "unresolvable path: \"" + localParam + "\""); + } + } catch (URISyntaxException use) { + _logger.warning("Ignoring module parameter with " + + "invalid URI syntax: \"" + uriParam + "\""); + } + } catch (IndexOutOfBoundsException ioobe) { + _logger.warning("Ignoring malformed module parameter \"" + + param + "\""); + } + } } diff --git a/pom.xml b/pom.xml index 4ba3fdf34..7d541f0b7 100644 --- a/pom.xml +++ b/pom.xml @@ -10,7 +10,7 @@ org.openpreservation.jhove jhove - 1.29.0-SNAPSHOT + 1.30.0-RC1 pom JHOVE - JSTOR/Harvard Object Validation Environment