From 900b1ddeb19c90d632e51773bd3a30d788ac5f70 Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Thu, 2 Nov 2017 10:45:43 -0700 Subject: [PATCH 1/4] Fixes invalid byte offset values for startxref and xref object offsets when bytes precede '%PDF-'. --- src/PdfSharp/Pdf.IO/Parser.cs | 119 +++++++++++++++++++++++++--------- 1 file changed, 90 insertions(+), 29 deletions(-) diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs index a5d12ac3..179b94e7 100644 --- a/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/PdfSharp/Pdf.IO/Parser.cs @@ -1031,30 +1031,91 @@ internal PdfTrailer ReadTrailer() throw new Exception("The StartXRef table could not be found, the file cannot be opened."); ReadSymbol(Symbol.StartXRef); - _lexer.Position = ReadInteger(); - - // Read all trailers. - while (true) - { - PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable); - // 1st trailer seems to be the best. - if (_document._trailer == null) - _document._trailer = trailer; - int prev = trailer.Elements.GetInteger(PdfTrailer.Keys.Prev); - if (prev == 0) - break; - //if (prev > lexer.PdfLength) - // break; - _lexer.Position = prev; - } - - return _document._trailer; - } - - /// - /// Reads cross reference table(s) and trailer(s). - /// - private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) + int startxref = _lexer.Position = ReadInteger(); + + // Must be before the first 'goto valid_xref;' statement. + int xref_offset = 0; + + // Check for valid startxref + if (IsValidXref()) + { + goto valid_xref; + } + + // If we reach this point, we have an invalid startxref + // First look for bytes preceding "%PDF-". Some pdf producers ignore these. 
+ if (length >= 1024) + { + // "%PDF-" should be in this range + string header = _lexer.ReadRawString(0, 1024); + idx = header.IndexOf("%PDF-", StringComparison.Ordinal); + } + else + { + string header = _lexer.ReadRawString(0, length); + idx = header.IndexOf("%PDF-", StringComparison.Ordinal); + } + + if (idx > 0) + { + //_lexer.ByteOffset = idx; + _lexer.Position = startxref + idx; + if (IsValidXref()) + { + xref_offset = idx; + goto valid_xref; + } + } + + valid_xref: + _lexer.Position = startxref + xref_offset; + + // Read all trailers. + while (true) + { + PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable, xref_offset); + // 1st trailer seems to be the best. + if (_document._trailer == null) + _document._trailer = trailer; + int prev = trailer.Elements.GetInteger(PdfTrailer.Keys.Prev); + if (prev == 0) + break; + //if (prev > lexer.PdfLength) + // break; + _lexer.Position = prev; + } + + return _document._trailer; + } + + /// + /// Checks that the current _lexer location is a valid xref. + /// + /// + private bool IsValidXref() + { + Symbol symbol = ScanNextToken(); + if (symbol == Symbol.XRef) + { + return true; + } + + if (symbol == Symbol.Integer) + { + // Just because we have an integer, doesn't mean the startxref is actually valid + if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj) + { + return true; + } + } + + return false; + } + + /// + /// Reads cross reference table(s) and trailer(s). 
+ /// + private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int xrefOffset) { Debug.Assert(xrefTable != null); @@ -1072,7 +1133,7 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) int length = ReadInteger(); for (int id = start; id < start + length; id++) { - int position = ReadInteger(); + int position = ReadInteger() + xrefOffset; int generation = ReadInteger(); ReadSymbol(Symbol.Keyword); string token = _lexer.Token; @@ -1114,10 +1175,10 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) return null; } - /// - /// Reads cross reference stream(s). - /// - private PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable) + /// + /// Reads cross reference stream(s). + /// + private PdfTrailer ReadXRefStream(PdfCrossReferenceTable xrefTable) { // Read cross reference stream. //Debug.Assert(_lexer.Symbol == Symbol.Integer); From 0e54078cb9343ac029c3f502412b5c776040987f Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Mon, 6 Nov 2017 15:25:10 -0800 Subject: [PATCH 2/4] IsValidXref() now checks to see if it is inside a stream. --- src/PdfSharp/Pdf.IO/Parser.cs | 36 +++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs index 179b94e7..ebaa6b0b 100644 --- a/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/PdfSharp/Pdf.IO/Parser.cs @@ -1094,6 +1094,42 @@ internal PdfTrailer ReadTrailer() /// private bool IsValidXref() { + int length = _lexer.PdfLength; + int position = _lexer.Position; + // Make sure not inside a stream. + + string content = ""; + int content_pos = position; + while (true) + { + // look for stream and endstream in 1k chunks. 
+ int read_length = Math.Min(1024, length - content_pos); + content += _lexer.ReadRawString(content_pos, read_length); + + int ss = content.IndexOf("stream", StringComparison.Ordinal); + int es = content.IndexOf("endstream", StringComparison.Ordinal); + + if (ss < es) + { + // Not inside of stream + break; + } + else if (ss > es) + { + // inside of stream + return false; + } + + content_pos = content_pos + read_length; + if (content_pos + read_length >= length) + { + // reached the end of the document without finding either. + break; + } + } + + _lexer.Position = position; + Symbol symbol = ScanNextToken(); if (symbol == Symbol.XRef) { From 3c346c3d89a9174842a92d1cd1d20e82709dc5b8 Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Tue, 14 Nov 2017 10:09:30 -0800 Subject: [PATCH 3/4] IsValidXref() will no longer check if it's inside of a stream beyond an EOF symbol. --- src/PdfSharp/Pdf.IO/Parser.cs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs index ebaa6b0b..ed3d40c6 100644 --- a/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/PdfSharp/Pdf.IO/Parser.cs @@ -1107,17 +1107,21 @@ private bool IsValidXref() content += _lexer.ReadRawString(content_pos, read_length); int ss = content.IndexOf("stream", StringComparison.Ordinal); + int eof = content.IndexOf("%%EOF", StringComparison.Ordinal); int es = content.IndexOf("endstream", StringComparison.Ordinal); - if (ss < es) - { - // Not inside of stream - break; - } - else if (ss > es) + int s = Math.Min(ss, eof); + + if (s != es) { - // inside of stream - return false; + if (s == -1) + return false; + else if (es == -1) + break; + else if (s < es) + break; + else if (s > es) + return false; } content_pos = content_pos + read_length; From 2a6ad48cd0400382d02ada418a6a6679aef07430 Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Mon, 11 Dec 2017 13:50:21 -0800 Subject: [PATCH 4/4] Fixed IsValidXref(). The previous stream/EOF checks were wrong.
--- src/PdfSharp/Pdf.IO/Parser.cs | 129 ++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 59 deletions(-) diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs index ed3d40c6..c08e0af6 100644 --- a/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/PdfSharp/Pdf.IO/Parser.cs @@ -1088,74 +1088,85 @@ internal PdfTrailer ReadTrailer() return _document._trailer; } - /// - /// Checks that the current _lexer location is a valid xref. - /// - /// - private bool IsValidXref() - { - int length = _lexer.PdfLength; - int position = _lexer.Position; - // Make sure not inside a stream. - - string content = ""; - int content_pos = position; - while (true) - { - // look for stream and endstream in 1k chunks. - int read_length = Math.Min(1024, length - content_pos); - content += _lexer.ReadRawString(content_pos, read_length); + /// + /// Checks that the current _lexer location is a valid xref. + /// + /// + private bool IsValidXref() + { + int length = _lexer.PdfLength; + int position = _lexer.Position; + // Make sure not inside a stream. - int ss = content.IndexOf("stream", StringComparison.Ordinal); - int eof = content.IndexOf("%%EOF", StringComparison.Ordinal); - int es = content.IndexOf("endstream", StringComparison.Ordinal); + string content = ""; + int content_pos = position; + while (true) + { + // look for stream and endstream in 1k chunks. 
+ int read_length = Math.Min(1024, length - content_pos); + content += _lexer.ReadRawString(content_pos, read_length); - int s = Math.Min(ss, eof); + int ss = content.IndexOf("stream", StringComparison.Ordinal); + int es = content.IndexOf("endstream", StringComparison.Ordinal); + int eof = content.IndexOf("%%EOF", StringComparison.Ordinal); - if (s != es) - { - if (s == -1) - return false; - else if (es == -1) - break; - else if (s < es) - break; - else if (s > es) - return false; - } + if (ss != es) + { + if (ss == -1) + { + if (eof != -1 && eof < es) + break; + else + return false; + } + else if (es == -1) + break; + else if (ss < es) + break; + else if (ss > es) + { + if (eof != -1 && eof < ss && eof < es) + break; + else + return false; + } + } - content_pos = content_pos + read_length; - if (content_pos + read_length >= length) - { - // reached the end of the document without finding either. - break; - } - } + if (eof != -1) + break; - _lexer.Position = position; + content_pos = content_pos + read_length; + if (content_pos + read_length >= length) + { + // reached the end of the document without finding either. + break; + } + } - Symbol symbol = ScanNextToken(); - if (symbol == Symbol.XRef) - { - return true; - } + _lexer.Position = position; - if (symbol == Symbol.Integer) - { - // Just because we have an integer, doesn't mean the startxref is actually valid - if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj) - { - return true; - } - } + Symbol symbol = ScanNextToken(); + if (symbol == Symbol.XRef) + { + return true; + } - return false; - } + if (symbol == Symbol.Integer) + { + // Just because we have an integer, doesn't mean the startxref is actually valid + if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj) + { + return true; + } + } - /// - /// Reads cross reference table(s) and trailer(s). 
- /// - private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int xrefOffset) + return false; + } + + /// + /// Reads cross reference table(s) and trailer(s). + /// + private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int xrefOffset) { Debug.Assert(xrefTable != null);