Skip to content

Commit

Permalink
Added support for scanning the document for objects when no valid cro…
Browse files Browse the repository at this point in the history
…ss-reference table is found.
  • Loading branch information
dmester committed Jul 28, 2021
1 parent 3ab55bb commit 20fe90e
Show file tree
Hide file tree
Showing 5 changed files with 250 additions and 11 deletions.
49 changes: 45 additions & 4 deletions src/PdfToSvg/Parsing/DocumentParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ private static long ReadStartXRef(byte[] buffer, int offset, int count)
var eof = Regex.Match(str, "startxref[\0\t\n\f\r ]*([0-9]+)[\0\t\n\f\r ]*%%EOF", RegexOptions.RightToLeft);
if (!eof.Success)
{
throw Exceptions.XRefTableNotFound();
return -1;
}

return long.Parse(eof.Groups[1].Value, CultureInfo.InvariantCulture);
Expand Down Expand Up @@ -400,20 +400,54 @@ maybeObjStream is PdfDictionary objStream &&
return objects;
}

/// <summary>
/// Builds a cross-reference table by letting <see cref="ObjectScanner"/> scan the
/// stream of the lexer for objects and trailers, instead of relying on the
/// cross-reference data stored in the file.
/// </summary>
/// <param name="cancellationToken">Token forwarded to the object scanner.</param>
/// <returns>
/// The scanned table, with its trailer set to the last scanned trailer dictionary
/// containing a <c>Root</c> entry.
/// </returns>
/// <remarks>
/// Throws <c>Exceptions.CorruptPdf()</c> when the scan reports failure, and
/// <c>Exceptions.MissingTrailer(...)</c> when none of the scanned trailers has a
/// <c>Root</c> entry.
/// </remarks>
private XRefTable RebuildXRefTable(CancellationToken cancellationToken)
{
    var scanSucceeded = ObjectScanner.TryScanObjects(
        lexer.Stream, out var rebuiltTable, out var trailerOffsets, cancellationToken);

    if (!scanSucceeded)
    {
        throw Exceptions.CorruptPdf();
    }

    // Trailers found later in the file are tried first.
    var index = trailerOffsets.Count;
    while (--index >= 0)
    {
        lexer.Seek(trailerOffsets[index], SeekOrigin.Begin);

        // The scanner matched the keyword with a regex; make sure the real lexer agrees.
        if (lexer.Read().Token != Token.Trailer)
        {
            continue;
        }

        var candidate = ReadDictionary();
        if (candidate.ContainsKey(Names.Root))
        {
            rebuiltTable.Trailer = candidate;
            return rebuiltTable;
        }
    }

    // No usable trailer; report the position of the first one that was scanned.
    throw Exceptions.MissingTrailer(trailerOffsets.First());
}

public XRefTable ReadXRefTables(long byteOffsetLastXRef, CancellationToken cancellationToken)
{
var xrefTable = new XRefTable();
var trailerSet = false;

var byteOffsets = new HashSet<long>();

if (byteOffsetLastXRef < 0)
{
Log.WriteLine("Missing file trailer in PDF. Indexing all objects.");
return RebuildXRefTable(cancellationToken);
}

while (byteOffsetLastXRef >= 0)
{
cancellationToken.ThrowIfCancellationRequested();

if (!byteOffsets.Add(byteOffsetLastXRef))
{
throw Exceptions.CircularXref(byteOffsetLastXRef);
Log.WriteLine("Circular xref in PDF. Indexing all objects.");
return RebuildXRefTable(cancellationToken);
}

lexer.Seek(byteOffsetLastXRef, SeekOrigin.Begin);
Expand All @@ -440,7 +474,8 @@ public XRefTable ReadXRefTables(long byteOffsetLastXRef, CancellationToken cance
}
else
{
throw Exceptions.MissingTrailer(byteOffsetLastXRef);
Log.WriteLine("Missing trailer after cross-reference table at position {0}. Indexing all objects.", byteOffsetLastXRef);
return RebuildXRefTable(cancellationToken);
}
}
else if (nextLexeme.Token == Token.Integer)
Expand All @@ -462,9 +497,15 @@ public XRefTable ReadXRefTables(long byteOffsetLastXRef, CancellationToken cance
}
else
{
throw Exceptions.MissingTrailer(byteOffsetLastXRef);
Log.WriteLine("Missing trailer after cross-reference stream at position {0}. Indexing all objects.", byteOffsetLastXRef);
return RebuildXRefTable(cancellationToken);
}
}
else
{
Log.WriteLine("Corrupt PDF file. Indexing all objects.");
return RebuildXRefTable(cancellationToken);
}
}

return xrefTable;
Expand Down
9 changes: 2 additions & 7 deletions src/PdfToSvg/Parsing/Exceptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@ public static Exception EncryptedPdf()
return new EncryptedPdfException();
}

public static Exception CircularXref(long byteOffsetXRef)
public static Exception CorruptPdf()
{
return new PdfParserException("Circular xref in pdf.", byteOffsetXRef);
return new PdfParserException("The PDF file is corrupt and could not be read.", 0);
}

public static Exception MissingTrailer(long byteOffsetXRef)
Expand Down Expand Up @@ -57,11 +57,6 @@ public static Exception HeaderNotFound()
return new PdfParserException("The specified file is not a valid PDF file. No file header was found.", 0);
}

public static Exception XRefTableNotFound()
{
return new PdfParserException("The specified file is not a valid PDF file. No XRef table was found.", 0);
}

public static Exception UnexpectedCharacter(BufferedReader reader, char unexpectedChar)
{
var errorPosition = reader.Position;
Expand Down
194 changes: 194 additions & 0 deletions src/PdfToSvg/Parsing/ObjectScanner.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
// Copyright (c) PdfToSvg.NET contributors.
// https://github.com/dmester/pdftosvg.net
// Licensed under the MIT License.

using PdfToSvg.DocumentModel;
using PdfToSvg.IO;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;

namespace PdfToSvg.Parsing
{
/// <summary>
/// Scans a stream from its beginning for <c>obj</c>, <c>endobj</c>, <c>stream</c>,
/// <c>endstream</c> and <c>trailer</c> keywords and builds a cross-reference table
/// from the object headers that were found.
/// </summary>
internal static class ObjectScanner
{
    // Size in bytes of the window that is read and scanned at a time.
    private const int BufferSize = 8096;

    // Number of bytes at the end of a window that are carried over to the start of the
    // next window, so a keyword split between two reads is still seen in one piece.
    private const int OverlapSize = 128;

    // The window is only compacted down to the overlap once it holds more than this many bytes.
    private const int CompactThreshold = 256;

    // A keyword is only recognized when it follows a line break and at most six spaces.
    // Group 1 is the whole keyword; groups 2 and 3 are the object number and generation
    // of an "N G obj" header. Built once instead of handing the pattern to the static
    // Regex API for every window.
    private static readonly Regex keywordRegex = new Regex(
        "[\r\n] {0,6}(stream\\b|endstream\\b|endobj\\b|trailer\\s{0,10}<<|(\\d{1,8}) (\\d{1,8}) obj\\b)");

    // Kinds of keywords recognized by the scanner.
    private enum ScanToken
    {
        Obj,
        EndObj,
        Stream,
        EndStream,
        Trailer,
    }

    // States of the state machine in TryScanObjects.
    private enum ParserState
    {
        None,
        InObject,
        InStream,
        AfterStream,
    }

    // A keyword found by ScanStream together with its byte position.
    private class ScanLexeme
    {
        public ScanToken Token;

        // Byte offset of the keyword, counted from where scanning started.
        public long Position;

        // Only assigned for ScanToken.Obj lexemes.
        public int ObjectNumber;
        public int Generation;

        public override string ToString()
        {
            if (Token == ScanToken.Obj)
            {
                return $"obj {ObjectNumber} {Generation}";
            }

            return Token.ToString().ToLowerInvariant();
        }
    }

    // Treats two lexemes as equal when they start at the same position. Used to drop
    // the duplicates produced when the overlap between two windows is scanned twice.
    private class PositionComparer : IEqualityComparer<ScanLexeme>
    {
        public bool Equals(ScanLexeme x, ScanLexeme y)
        {
            return x.Position == y.Position;
        }

        public int GetHashCode(ScanLexeme obj)
        {
            return unchecked((int)obj.Position);
        }
    }

    /// <summary>
    /// Reads the stream window by window from its current position and returns every
    /// keyword matched by the keyword regex.
    /// </summary>
    /// <param name="stream">Stream to read from.</param>
    /// <param name="cancellationToken">Checked after each window has been processed.</param>
    /// <returns>
    /// The lexemes in the order they were matched. Positions are relative to the stream
    /// position at the time of the call. A keyword inside the overlap between two windows
    /// can be returned more than once.
    /// </returns>
    private static List<ScanLexeme> ScanStream(Stream stream, CancellationToken cancellationToken)
    {
        var bufferStartPosition = 0L;
        var buffer = new byte[BufferSize];
        var bufferLength = 0;

        var lexemes = new List<ScanLexeme>();

        int read;
        do
        {
            if (bufferLength > CompactThreshold)
            {
                // Keep only the tail of the previous window and remember how far the window moved.
                Buffer.BlockCopy(buffer, bufferLength - OverlapSize, buffer, 0, OverlapSize);
                bufferStartPosition += bufferLength - OverlapSize;
                bufferLength = OverlapSize;
            }

            // NOTE(review): ReadAll is a project extension; presumably it fills the requested
            // range unless the end of the stream is reached - confirm.
            read = stream.ReadAll(buffer, bufferLength, buffer.Length - bufferLength);
            bufferLength += read;

            if (read > 0)
            {
                // ASCII decoding maps every byte to exactly one char, so match indexes are byte offsets.
                var bufferText = Encoding.ASCII.GetString(buffer, 0, bufferLength);

                foreach (Match match in keywordRegex.Matches(bufferText))
                {
                    var value = match.Groups[1].Value;
                    var lexeme = new ScanLexeme();

                    lexeme.Position = bufferStartPosition + match.Groups[1].Index;

                    // Ordinal comparisons: the keywords are plain ASCII tokens and must not be
                    // subject to culture-specific comparison rules.
                    if (value.StartsWith("stream", StringComparison.Ordinal))
                    {
                        lexeme.Token = ScanToken.Stream;
                    }
                    else if (value.StartsWith("endstream", StringComparison.Ordinal))
                    {
                        lexeme.Token = ScanToken.EndStream;
                    }
                    else if (value.StartsWith("endobj", StringComparison.Ordinal))
                    {
                        lexeme.Token = ScanToken.EndObj;
                    }
                    else if (value.StartsWith("trailer", StringComparison.Ordinal))
                    {
                        lexeme.Token = ScanToken.Trailer;
                    }
                    else
                    {
                        // Only the "N G obj" alternative is left; the regex limits both
                        // numbers to eight digits, so they always fit in an int.
                        lexeme.Token = ScanToken.Obj;
                        lexeme.ObjectNumber = int.Parse(match.Groups[2].Value, CultureInfo.InvariantCulture);
                        lexeme.Generation = int.Parse(match.Groups[3].Value, CultureInfo.InvariantCulture);
                    }

                    lexemes.Add(lexeme);
                }
            }

            cancellationToken.ThrowIfCancellationRequested();
        }
        while (read > 0);

        return lexemes;
    }

    /// <summary>
    /// Rewinds the stream to position 0, scans it for keywords and collects a
    /// cross-reference entry for each object header and the position of each trailer
    /// that is not located inside a stream.
    /// </summary>
    /// <param name="stream">Stream to scan. Its position is changed by this method.</param>
    /// <param name="xrefs">Receives one entry per accepted <c>N G obj</c> header.</param>
    /// <param name="trailerPositions">Receives the byte offsets of the accepted <c>trailer</c> keywords.</param>
    /// <param name="cancellationToken">Token used for cancelling the scan.</param>
    /// <returns><c>true</c> if at least one trailer and at least one object were found.</returns>
    public static bool TryScanObjects(Stream stream, out XRefTable xrefs, out List<long> trailerPositions, CancellationToken cancellationToken)
    {
        stream.Position = 0;

        var lexemes = ScanStream(stream, cancellationToken);
        var state = ParserState.None;

        xrefs = new XRefTable();

        trailerPositions = new List<long>();

        // Keywords in the overlap between two windows may have been reported twice.
        foreach (var lexeme in lexemes.Distinct(new PositionComparer()))
        {
            switch (lexeme.Token)
            {
                case ScanToken.Obj:
                    // Object headers seen inside stream data are ignored.
                    if (state != ParserState.InStream)
                    {
                        xrefs.Add(new XRef
                        {
                            ByteOffset = lexeme.Position,
                            ObjectNumber = lexeme.ObjectNumber,
                            Generation = lexeme.Generation,
                            Type = XRefEntryType.NotFree,
                        });
                        state = ParserState.InObject;
                    }
                    break;

                case ScanToken.Stream:
                    if (state == ParserState.InObject)
                    {
                        state = ParserState.InStream;
                    }
                    break;

                case ScanToken.EndStream:
                    if (state == ParserState.InStream)
                    {
                        state = ParserState.AfterStream;
                    }
                    break;

                case ScanToken.EndObj:
                    // NOTE(review): the state is only reset when endobj is seen while still
                    // inside a stream, i.e. when no endstream was seen first. This may be
                    // deliberate recovery from a missing endstream, or the condition may
                    // have been meant to be `!= InStream` - confirm.
                    if (state == ParserState.InStream)
                    {
                        state = ParserState.None;
                    }
                    break;

                case ScanToken.Trailer:
                    // Trailer keywords seen inside stream data are ignored.
                    if (state != ParserState.InStream)
                    {
                        trailerPositions.Add(lexeme.Position);
                    }
                    break;
            }
        }

        return trailerPositions.Count > 0 && xrefs.Count > 0;
    }
}
}
9 changes: 9 additions & 0 deletions tests/TestFiles/Own/expected/missing-xref.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/TestFiles/Own/input/missing-xref.pdf
Binary file not shown.

0 comments on commit 20fe90e

Please sign in to comment.