diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs
index a5d12ac3..c08e0af6 100644
--- a/src/PdfSharp/Pdf.IO/Parser.cs
+++ b/src/PdfSharp/Pdf.IO/Parser.cs
@@ -1031,30 +1031,126 @@ internal PdfTrailer ReadTrailer()
                 throw new Exception("The StartXRef table could not be found, the file cannot be opened.");
 
             ReadSymbol(Symbol.StartXRef);
-            _lexer.Position = ReadInteger();
+            int startxref = _lexer.Position = ReadInteger();
+
+            // Number of bytes found in front of the "%PDF-" header. When non-zero, the byte
+            // offsets stored in the file are presumably relative to the header and have to
+            // be shifted by this amount. Stays 0 if startxref is usable as written.
+            int xref_offset = 0;
+
+            if (!IsValidXref())
+            {
+                // The startxref value does not point at a cross reference section.
+                // Some pdf producers ignore bytes preceding "%PDF-", so look for the header
+                // within the first 1024 bytes of the file.
+                string header = _lexer.ReadRawString(0, Math.Min(1024, length));
+                idx = header.IndexOf("%PDF-", StringComparison.Ordinal);
+                if (idx > 0)
+                {
+                    _lexer.Position = startxref + idx;
+                    if (IsValidXref())
+                        xref_offset = idx;
+                }
+            }
+
+            // IsValidXref scans tokens, so put the lexer back on the (possibly shifted) xref.
+            _lexer.Position = startxref + xref_offset;
+
+            // Read all trailers.
+            while (true)
+            {
+                PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable, xref_offset);
+                // 1st trailer seems to be the best.
+                if (_document._trailer == null)
+                    _document._trailer = trailer;
+                int prev = trailer.Elements.GetInteger(PdfTrailer.Keys.Prev);
+                if (prev == 0)
+                    break;
+                //if (prev > lexer.PdfLength)
+                //  break;
+                // Prev is a file offset just like startxref, so it needs the same shift.
+                _lexer.Position = prev + xref_offset;
+            }
+
+            return _document._trailer;
+        }
 
-            // Read all trailers.
+        /// <summary>
+        /// Checks that the current _lexer location is a valid xref: not inside stream data and
+        /// followed by either the xref keyword or an "integer integer obj" object header.
+        /// The lexer position is not restored afterwards.
+        /// </summary>
+        /// <returns>true if the location looks like a cross reference section; otherwise false.</returns>
+        private bool IsValidXref()
+        {
+            int length = _lexer.PdfLength;
+            int position = _lexer.Position;
+
+            // A bogus startxref can point outside the file; there is nothing to scan there.
+            if (position < 0 || position >= length)
+                return false;
+
+            // Make sure not inside a stream.
+            string content = "";
+            int content_pos = position;
             while (true)
             {
-                PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable);
-                // 1st trailer seems to be the best.
-                if (_document._trailer == null)
-                    _document._trailer = trailer;
-                int prev = trailer.Elements.GetInteger(PdfTrailer.Keys.Prev);
-                if (prev == 0)
+                // Look for stream and endstream in 1k chunks. The chunks are accumulated so a
+                // keyword that straddles a chunk boundary is still found.
+                int read_length = Math.Min(1024, length - content_pos);
+                content += _lexer.ReadRawString(content_pos, read_length);
+
+                int ss = content.IndexOf("stream", StringComparison.Ordinal);
+                int es = content.IndexOf("endstream", StringComparison.Ordinal);
+                int eof = content.IndexOf("%%EOF", StringComparison.Ordinal);
+
+                // "endstream" contains "stream", so ss is never -1 once es is found, and ss > es
+                // means the first "stream" is just the tail of that "endstream": a stream ends
+                // after the location without having started after it.
+                if (es != -1 && ss > es)
+                {
+                    // Tolerated only when a %%EOF marker precedes that endstream.
+                    if (eof != -1 && eof < es)
+                        break;
+                    return false;
+                }
+
+                // A stream keyword or the %%EOF marker was found: not inside a stream.
+                if (ss != -1 || eof != -1)
                     break;
-                //if (prev > lexer.PdfLength)
-                //  break;
-                _lexer.Position = prev;
+
+                content_pos += read_length;
+                if (content_pos >= length)
+                {
+                    // Reached the end of the document without finding any of the markers.
+                    break;
+                }
             }
 
-            return _document._trailer;
+            _lexer.Position = position;
+
+            Symbol symbol = ScanNextToken();
+            if (symbol == Symbol.XRef)
+            {
+                return true;
+            }
+
+            if (symbol == Symbol.Integer)
+            {
+                // Just because we have an integer, doesn't mean the startxref is actually valid
+                if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj)
+                {
+                    return true;
+                }
+            }
+
+            return false;
         }
 
         /// <summary>
         /// Reads cross reference table(s) and trailer(s).
         /// </summary>
-        private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable)
+        private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int xrefOffset)
         {
             Debug.Assert(xrefTable != null);
 
@@ -1072,7 +1168,7 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable)
                         int length = ReadInteger();
                         for (int id = start; id < start + length; id++)
                         {
-                            int position = ReadInteger();
+                            int position = ReadInteger() + xrefOffset;
                             int generation = ReadInteger();
                             ReadSymbol(Symbol.Keyword);
                             string token = _lexer.Token;