diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 00000000..066d959f --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,50 @@ +# ASP.NET Core (.NET Framework) +# Build and test ASP.NET Core projects targeting the full .NET Framework. +# Add steps that publish symbols, save build artifacts, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/dotnet-core + +trigger: +- Release + +pool: + vmImage: 'windows-latest' + +variables: + solution: '**/*.sln' + buildPlatform: 'Any CPU' + buildConfiguration: 'Release' + +steps: +- task: NuGetToolInstaller@1 + +- task: NuGetCommand@2 + inputs: + restoreSolution: '$(solution)' + +- task: VSBuild@1 + inputs: + solution: '$(solution)' + msbuildArgs: '/p:DeployOnBuild=true /p:WebPublishMethod=Package /p:PackageAsSingleFile=true /p:SkipInvalidConfigurations=true /p:DesktopBuildPackageLocation="$(build.artifactStagingDirectory)\WebApp.zip" /p:DeployIisAppPath="Default Web Site"' + platform: '$(buildPlatform)' + configuration: '$(buildConfiguration)' + +- task: VSTest@2 + inputs: + platform: '$(buildPlatform)' + configuration: '$(buildConfiguration)' + +- task: NuGetCommand@2 + inputs: + command: 'pack' + packagesToPack: '**/*.csproj' + versioningScheme: 'byPrereleaseNumber' + majorVersion: '1' + minorVersion: '0' + patchVersion: '0' + +- task: NuGetCommand@2 + inputs: + command: 'push' + packagesToPush: '$(Build.ArtifactStagingDirectory)/**/*.nupkg;!$(Build.ArtifactStagingDirectory)/**/*.symbols.nupkg' + nuGetFeedType: 'internal' + publishVstsFeed: 'b23e5b36-79b9-4a22-a765-20dec00e216d' diff --git a/src/PdfSharp-gdi/PdfSharp-gdi.csproj b/src/PdfSharp-gdi/PdfSharp-gdi.csproj index 941eee61..4ab868f8 100644 --- a/src/PdfSharp-gdi/PdfSharp-gdi.csproj +++ b/src/PdfSharp-gdi/PdfSharp-gdi.csproj @@ -98,6 +98,28 @@ none AllRules.ruleset + + true + bin\x64\Debug\ + TRACE;DEBUG;GDI;UseGdiObjects + 285212672 + 4096 + x64 + default + prompt + AllRules.ruleset + + + bin\x64\Release\ + TRACE;GDI;UseGdiObjects + 285212672 + bin\Release\PdfSharp-gdi.xml + true + true + 4096 + x64 + AllRules.ruleset + System diff --git a/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs b/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs index cd56e94f..3810ca9a 100644 --- a/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs +++ b/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs @@ -75,7 +75,7 @@ public PdfTrailer(PdfCrossReferenceStream trailer) if (id != null) Elements.SetValue(Keys.ID, id); } - + public int Size { get { return Elements.GetInteger(Keys.Size); } @@ -218,6 +218,37 @@ internal void Finish() _document._irefTable.IsUnderConstruction = false; } + /// + /// Constructs the PdfTrailer from a document. + /// + /// the parser used to read the file. + internal void ConstructFromDocument(Parser parser) + { + // TODO - May need to also search for encryption related trailer info + PdfCrossReferenceTable xrefTable = _document._irefTable; + Elements.SetInteger(Keys.Size, xrefTable.ObjectTable.Count); + + // find the root. + PdfDictionary rootToUse = null; + foreach (var reference in xrefTable.AllReferences) + { + PdfObject obj = parser.ReadObject(null, reference.ObjectID, false, false); + if (obj is PdfDictionary dObj) + { + if (dObj.Elements[PdfCatalog.Keys.Type] as PdfName == "/Catalog") + { + if (rootToUse == null) + rootToUse = dObj; + else if (dObj.ObjectID.GenerationNumber > rootToUse.ObjectID.GenerationNumber) + rootToUse = dObj; + } + } + } + + if (rootToUse != null) + Elements.SetReference(Keys.Root, rootToUse); + } + /// /// Predefined keys of this dictionary. /// diff --git a/src/PdfSharp/Pdf.IO/Lexer.cs b/src/PdfSharp/Pdf.IO/Lexer.cs index 5bff4193..c0e54b70 100644 --- a/src/PdfSharp/Pdf.IO/Lexer.cs +++ b/src/PdfSharp/Pdf.IO/Lexer.cs @@ -76,98 +76,138 @@ public int Position } } - /// - /// Reads the next token and returns its type. If the token starts with a digit, the parameter - /// testReference specifies how to treat it. If it is false, the lexer scans for a single integer. - /// If it is true, the lexer checks if the digit is the prefix of a reference. If it is a reference, - /// the token is set to the object ID followed by the generation number separated by a blank - /// (the 'R' is omitted from the token). - /// - // /// Indicates whether to test the next token if it is a reference. - public Symbol ScanNextToken() + /// + /// Reads the next token and returns its type. If the token starts with a digit, the parameter + /// testReference specifies how to treat it. If it is false, the lexer scans for a single integer. + /// If it is true, the lexer checks if the digit is the prefix of a reference. If it is a reference, + /// the token is set to the object ID followed by the generation number separated by a blank + /// (the 'R' is omitted from the token). + /// + // /// Indicates whether to test the next token if it is a reference. + public Symbol ScanNextToken() + { + return ScanNextToken(out int location); + } + + /// + /// Reads the next token and returns its type. If the token starts with a digit, the parameter + /// testReference specifies how to treat it. If it is false, the lexer scans for a single integer. + /// If it is true, the lexer checks if the digit is the prefix of a reference. If it is a reference, + /// the token is set to the object ID followed by the generation number separated by a blank + /// (the 'R' is omitted from the token). + /// + // /// The start position of the next token. + public Symbol ScanNextToken(out int position) + { + Symbol symbol = Symbol.None; + if (!TryScanNextToken(out symbol, out position)) + ParserDiagnostics.HandleUnexpectedCharacter(_nextChar); + return symbol; + } + + public bool TryScanNextToken(out Symbol symbol, out int position) { Again: _token = new StringBuilder(); char ch = MoveToNonWhiteSpace(); - switch (ch) + position = Position; + switch (ch) { case '%': // Eat comments, the parser doesn't handle them //return symbol = ScanComment(); - ScanComment(); + symbol = _symbol = ScanComment(); + // Do not eat EOF + if (symbol == Symbol.Eof) + return true; goto Again; case '/': - return _symbol = ScanName(); - - //case 'R': - // if (Lexer.IsWhiteSpace(nextChar)) - // { - // ScanNextChar(); - // return Symbol.R; - // } - // break; - + symbol = _symbol = ScanName(); + return true; + case '+': //TODO is it so easy? case '-': - return _symbol = ScanNumber(); + symbol = _symbol = ScanNumber(); + return true; case '(': - return _symbol = ScanLiteralString(); + symbol = _symbol = ScanLiteralString(); + return true; case '[': ScanNextChar(true); - return _symbol = Symbol.BeginArray; + symbol = _symbol = Symbol.BeginArray; + return true; case ']': ScanNextChar(true); - return _symbol = Symbol.EndArray; + symbol = _symbol = Symbol.EndArray; + return true; case '<': if (_nextChar == '<') { ScanNextChar(true); ScanNextChar(true); - return _symbol = Symbol.BeginDictionary; + symbol = _symbol = Symbol.BeginDictionary; + return true; } - return _symbol = ScanHexadecimalString(); + symbol = _symbol = ScanHexadecimalString(); + return true; case '>': if (_nextChar == '>') { ScanNextChar(true); ScanNextChar(true); - return _symbol = Symbol.EndDictionary; + symbol = _symbol = Symbol.EndDictionary; + return true; } - ParserDiagnostics.HandleUnexpectedCharacter(_nextChar); - break; + + symbol = _symbol = Symbol.None; + return false; case '.': - return _symbol = ScanNumber(); + symbol = _symbol = ScanNumber(); + return true; } if (char.IsDigit(ch)) #if true_ - return ScanNumberOrReference(); + symbol = ScanNumberOrReference(); + return true; #else if (PeekReference()) - return _symbol = ScanNumber(); + { + symbol = _symbol = ScanNumber(); + return true; + } else - return _symbol = ScanNumber(); + { + symbol = _symbol = ScanNumber(); + return true; + } #endif if (char.IsLetter(ch)) - return _symbol = ScanKeyword(); + { + symbol = _symbol = ScanKeyword(); + return true; + } if (ch == Chars.EOF) - return _symbol = Symbol.Eof; + { + symbol = _symbol = Symbol.Eof; + return true; + } // #??? - - ParserDiagnostics.HandleUnexpectedCharacter(ch); - return _symbol = Symbol.None; + + symbol = _symbol = Symbol.None; + return false; } - + /// /// Reads the raw content of a stream. /// @@ -190,7 +230,77 @@ public byte[] ReadStream(int length) else pos = _idxChar + 1; - _pdfSteam.Position = pos; + // Producer: + // Problem: Incorrect stream length + // Fix: Find the endstream keyword and measure the length + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8 + + // Producer: + // Problem: Not all pdf producers add a eol marker before endstream + // Fix: double check for endstream without the eol marker + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8 + + // Producer: + // Problem: Some pdf producers replace the eol marker with a carriage return + // Fix: double check for endstream without the eol marker + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8 + + // Verify stream length and resolve if bad + string nendstream = $"{'\n'}endstream"; + string rendstream = $"{'\r'}endstream"; + string rnendstream = $"{'\r'}{'\n'}endstream"; + string endstream = "endstream"; + + string postStream = ReadRawString(pos + length, rnendstream.Length); + + bool bValid = postStream.StartsWith(nendstream) || + postStream.StartsWith(rendstream) || + postStream.StartsWith(rnendstream) || + postStream.StartsWith(endstream); // Not all pdf producers add a eol marker before endstream + + if (!bValid) + { + string[] endstreamValues = { nendstream, rendstream, endstream }; + + int IndexOfEndStream(string val) + { + // Find the smallest value + int offset = -1; + + foreach (var es in endstreamValues) + { + int o = val.IndexOf(es, StringComparison.Ordinal); + if (o < offset || offset == -1) + { + offset = o; + } + } + + return offset; + } + + + // find the first endstream occurrence + // first check to see if it is within the specified stream length. + int idxOffset = IndexOfEndStream(postStream); + if (idxOffset != -1) + { + length = length + idxOffset; + } + + if (idxOffset == -1) + { + // TODO:: read in chunks + postStream = ReadRawString(pos, _pdfLength - pos); + idxOffset = IndexOfEndStream(postStream); + if (idxOffset != -1) + { + length = idxOffset; + } + } + } + + _pdfSteam.Position = pos; byte[] bytes = new byte[length]; int read = _pdfSteam.Read(bytes, 0, length); Debug.Assert(read == length); @@ -247,20 +357,78 @@ public Symbol ScanName() while (true) { char ch = AppendAndScanNextChar(); - if (IsWhiteSpace(ch) || IsDelimiter(ch) || ch == Chars.EOF) - return _symbol = Symbol.Name; - if (ch == '#') - { - ScanNextChar(true); - char[] hex = new char[2]; - hex[0] = _currChar; - hex[1] = _nextChar; - ScanNextChar(true); - // TODO Check syntax - ch = (char)(ushort)int.Parse(new string(hex), NumberStyles.AllowHexSpecifier); - _currChar = ch; - } + if (ch == '#') + { + ScanNextChar(true); + char[] hex = new char[2]; + hex[0] = _currChar; + hex[1] = _nextChar; + ScanNextChar(true); + // TODO Check syntax + ch = (char)(ushort)int.Parse(new string(hex), NumberStyles.AllowHexSpecifier); + _currChar = ch; + continue; + } + + if (IsNameOrCommentDelimiter(ch) || ch == Chars.EOF) + { + return _symbol = Symbol.Name; + } + + if (IsWhiteSpace(ch)) + { + //TODO: Check that the white space is valid. + return _symbol = Symbol.Name; + } + + //Handle invalid delimiters + switch (ch) + { + case '(': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + case ')': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + case '<': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + case '>': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + case '[': + //TODO: Not Complete + if (IsWhiteSpace(_nextChar) || IsDelimiter(_nextChar) || char.IsNumber(_nextChar) || _nextChar == '.' || _nextChar == '-' || PeekArrayKeyword()) + { + return _symbol = Symbol.Name; + } + break; + case ']': + //TODO: Not Complete + if (IsWhiteSpace(_nextChar) || IsDelimiter(_nextChar) || _nextChar == Chars.EOF) + { + return _symbol = Symbol.Name; + } + + string tkn = Token; + + int position = Position; + ScanNextChar(true); + MoveToNonWhiteSpace(); + bool isRef = PeekReference(); + Position = position; + _token = new StringBuilder(tkn); + if (isRef) + return _symbol = Symbol.Name; + break; + case '{': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + case '}': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + } } } @@ -634,20 +802,22 @@ internal char ScanNextChar(bool handleCRLF) // Treat single CR as LF. _currChar = Chars.LF; } - } + //Console.WriteLine(); + } } + //Console.Write(_currChar); return _currChar; } - ///// - ///// Resets the current token to the empty string. - ///// - //void ClearToken() - //{ - // _token.Length = 0; - //} + ///// + ///// Resets the current token to the empty string. + ///// + //void ClearToken() + //{ + // _token.Length = 0; + //} - bool PeekReference() + bool PeekReference() { // A Reference has the form "nnn mmm R". The implementation of the parser used a // reduce/shift algorithm in the first place. But this case is the only one we need to @@ -695,6 +865,39 @@ bool PeekReference() return false; } + bool PeekArrayKeyword() + { + StringBuilder token = _token; + int position = Position; + ScanNextChar(true); + + //Pretty sure I want to skip any non white space + char ch = MoveToNonWhiteSpace(); + + //reset the _token + _token = new StringBuilder(); + + while (!IsWhiteSpace(ch) && !IsDelimiter(ch)) + { + ch = AppendAndScanNextChar(); + } + + bool b_is_keyword = false; + switch (_token.ToString()) + { + case "null": + case "true": + case "false": + b_is_keyword = true; + break; + } + + Position = position; + _token = token; + + return b_is_keyword; + } + /// /// Appends current character to the token and reads next one. /// @@ -882,10 +1085,24 @@ internal static bool IsDelimiter(char ch) return false; } - /// - /// Gets the length of the PDF output. - /// - public int PdfLength + /// + /// Indicates whether the specified character is a PDF delimiter character. + /// + internal static bool IsNameOrCommentDelimiter(char ch) + { + switch (ch) + { + case '/': + case '%': + return true; + } + return false; + } + + /// + /// Gets the length of the PDF output. + /// + public int PdfLength { get { return _pdfLength; } } diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs index 07b353a9..94bc7d20 100644 --- a/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/PdfSharp/Pdf.IO/Parser.cs @@ -80,6 +80,22 @@ public int MoveToObject(PdfObjectID objectID) return _lexer.Position = position; } + /// + /// Tries to set PDF input stream position to the specified object. + /// + public bool TryMoveToObject(PdfObjectID objectID, out int position) + { + position = _document._irefTable[objectID].Position; + if (position == -1) + { + position = _lexer.Position; + return false; + } + + _lexer.Position = position; + return true; + } + public Symbol Symbol { get { return _lexer.Symbol; } @@ -118,7 +134,8 @@ public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool incl int generationNumber = objectID.GenerationNumber; if (!fromObjecStream) { - MoveToObject(objectID); + if (!TryMoveToObject(objectID, out int position)) + return null; objectNumber = ReadInteger(); generationNumber = ReadInteger(); } @@ -261,49 +278,35 @@ public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool incl ParserDiagnostics.HandleUnexpectedToken(_lexer.Token); break; } - symbol = ScanNextToken(); - if (symbol == Symbol.BeginStream) + + int revert_pos = _lexer.Position; + + ParserState state = SaveState(); + TryScanNextToken(out symbol); + if (symbol == Symbol.BeginStream || symbol == Symbol.None) { + if (symbol == Symbol.None) + { + // Failed to get a proper symbol + // probably missing "stream" token + RestoreState(state); + } + PdfDictionary dict = (PdfDictionary)pdfObject; Debug.Assert(checkForStream, "Unexpected stream..."); -#if true_ - ReadStream(dict); -#else + int length = GetStreamLength(dict); byte[] bytes = _lexer.ReadStream(length); -#if true_ - if (dict.Elements.GetString("/Filter") == "/FlateDecode") - { - if (dict.Elements["/Subtype"] == null) - { - try - { - byte[] decoded = Filtering.FlateDecode.Decode(bytes); - if (decoded.Length == 0) - goto End; - string pageContent = Filtering.FlateDecode.DecodeToString(bytes); - if (pageContent.Length > 100) - pageContent = pageContent.Substring(pageContent.Length - 100); - pageContent.GetType(); - bytes = decoded; - dict.Elements.Remove("/Filter"); - dict.Elements.SetInteger("/Length", bytes.Length); - } - catch - { - } - } - End: ; - } -#endif + PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict); dict.Stream = stream; - ReadSymbol(Symbol.EndStream); - symbol = ScanNextToken(); -#endif + + revert_pos = _lexer.Position; + while ((symbol = ScanNextToken()) == Symbol.EndStream); } - if (!fromObjecStream && symbol != Symbol.EndObj) - ParserDiagnostics.ThrowParserException(PSSR.UnexpectedToken(_lexer.Token)); + if (!fromObjecStream && symbol != Symbol.EndObj) + _lexer.Position = revert_pos; + return pdfObject; } @@ -322,7 +325,7 @@ private void ReadStream(PdfDictionary dict) Debug.Assert(dict.Stream == null, "Dictionary already has a stream."); dict.Stream = stream; ReadSymbol(Symbol.EndStream); - ScanNextToken(); + while (ScanNextToken() == Symbol.EndStream); } // HACK: Solve problem more general. @@ -339,11 +342,47 @@ private int GetStreamLength(PdfDictionary dict) if (reference != null) { ParserState state = SaveState(); - object length = ReadObject(null, reference.ObjectID, false, false); + object pdf_obj = ReadObject(null, reference.ObjectID, false, false); RestoreState(state); - int len = ((PdfIntegerObject)length).Value; - dict.Elements["/Length"] = new PdfInteger(len); - return len; + + + + + int len = -1; + if (pdf_obj is PdfIntegerObject length_obj) + { + len = length_obj.Value; + } + // For whatever reason, ReadObject() did not return a valid PdfIntegerObject + else + { + // Read 1k chunks until we find an "endstream" symbol + string content = ""; + int read_pos = _lexer.Position; + int se = -1; + while (true) + { + int read_len = Math.Min(_lexer.PdfLength - read_pos, 1024); + content += _lexer.ReadRawString(read_pos, read_len); + read_pos += 1024; + + se = content.IndexOf("endstream", StringComparison.Ordinal); + if (se != -1) + { + len = se - 2; // By spec, the stream should start on a new line. remove crlf chars from the count. + break; + } + + if (read_pos >= _lexer.PdfLength) + break; + } + } + + if (len != -1) + { + dict.Elements["/Length"] = new PdfInteger(len); + return len; + } } throw new InvalidOperationException("Cannot retrieve stream length."); } @@ -537,10 +576,41 @@ private void ParseObject(Symbol stop) //case Symbol.StartXRef: //case Symbol.Eof: default: - ParserDiagnostics.HandleUnexpectedToken(_lexer.Token); - SkipCharsUntil(stop); - return; - } + // Any Keyword can be treated as a literal string. + switch (stop) + { + case Symbol.EndArray: + // Arrays are space delimited. + while (true) + { + char ch = _lexer.AppendAndScanNextChar(); + if (Lexer.IsWhiteSpace(ch) || ch == Chars.EOF || ch == Chars.BracketRight) + { + _stack.Shift(new PdfString(_lexer.Token, PdfStringFlags.RawEncoding)); + break; + } + } + break; + case Symbol.EndDictionary: + // Dictionaries are key value pairs where key must be a name. + while (true) + { + char ch = _lexer.AppendAndScanNextChar(); + if (ch == Chars.Slash || ch == Chars.Greater) + { + _stack.Shift(new PdfString(_lexer.Token, PdfStringFlags.RawEncoding)); + break; + } + } + break; + default: + ParserDiagnostics.HandleUnexpectedToken(_lexer.Token); + SkipCharsUntil(stop); + break; + } + + return; + } } ParserDiagnostics.ThrowParserException("Unexpected end of file."); // TODO L10N using PSSR. } @@ -549,6 +619,16 @@ private Symbol ScanNextToken() { return _lexer.ScanNextToken(); } + + private Symbol ScanNextToken(out int position) + { + return _lexer.ScanNextToken(out position); + } + + private bool TryScanNextToken(out Symbol symbol) + { + return _lexer.TryScanNextToken(out symbol, out int position); + } private Symbol ScanNextToken(out string token) { @@ -1030,11 +1110,29 @@ internal PdfTrailer ReadTrailer() if (idx == -1) throw new Exception("The StartXRef table could not be found, the file cannot be opened."); - ReadSymbol(Symbol.StartXRef); - _lexer.Position = ReadInteger(); - - // Read all trailers. - while (true) + Symbol s = ReadSymbol(Symbol.StartXRef); + _lexer.Position = ReadInteger(); + + // Producer: iText1.3.1 by lowagie.com (based on itext-paulo-154) + // Problem: certificate data added to the start of file. Invalid startxref byte offset + // Fix: We could search for the a valid xref table but all byte offsets are probably incorrect. + // Probably best to just recreate the xref table. + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.5.5 + + // Check for valid startxref + if (!IsValidXref()) + { + PdfTrailer trailer; + bool bSuccess = TryRecreateXRefTableAndTrailer(out trailer, _document); + if (!bSuccess) + throw new Exception("Could not recreate the xref table or trailer."); + + _document._trailer = trailer; + return _document._trailer; + } + + // Read all trailers. + while (true) { PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable); // 1st trailer seems to be the best. @@ -1052,9 +1150,191 @@ internal PdfTrailer ReadTrailer() } /// - /// Reads cross reference table(s) and trailer(s). + /// Checks that the current _lexer location is a valid xref. /// - private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) + /// + private bool IsValidXref() + { + int position = _lexer.Position; + try + { + Symbol symbol = ScanNextToken(); + if (symbol == Symbol.XRef) // xref table + { + _lexer.Position = position; + return true; + } + + if (symbol == Symbol.Integer) // Linearization parameter dictionary + { + // Just because we have an integer, doesn't mean the startxref is actually valid + if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj) + { + _lexer.Position = position; + return true; + } + } + + _lexer.Position = position; + return false; + } + catch + { + _lexer.Position = position; + return false; + } + } + + private bool TryRecreateXRefTableAndTrailer(out PdfTrailer trailer, PdfDocument document) + { + PdfCrossReferenceTable xrefTable = document._irefTable; + trailer = null; + int length = _lexer.PdfLength; + + // because some pdf producers put random info before the header, we need to find a proper starting position. + // i.e. Producer: iText1.3.1 by lowagie.com (based on itext-paulo-154) + int startIdx = -1; + string contents = ""; + for (int i = 0, pos = 0; startIdx == -1 && pos < length; i++, pos = 1024 * i) + { + int len = Math.Min(1024, length - pos); + contents = $"{contents}{_lexer.ReadRawString(pos, len)}"; + startIdx = contents.IndexOf("%PDF-1.", StringComparison.Ordinal); + } + + if (startIdx == -1) + return false; + + // Don't look past the last %%EOF marker + int endIdx = -1; + contents = ""; + for (int i = 1; endIdx == -1; i++) + { + int pos = length - (1024 * i); + int len = 1024; + + if (pos < 0) + { + len = len + pos; + pos = 0; + } + + contents = $"{_lexer.ReadRawString(pos, len)}{contents}"; + endIdx = contents.LastIndexOf("%%EOF", StringComparison.Ordinal); + if (endIdx != -1) + endIdx = length - contents.Length + endIdx; + + if (pos == 0) + break; + } + + if (endIdx == -1) + return false; + + endIdx = endIdx + 5; // This should be where Eof char is + + // Recreate the xref table. + // + // When symbol == Symbol.Obj + // [0] - generation + // [1] - id + TokenInfo[] token_stack = new TokenInfo[2]; + + _lexer.Position = startIdx; + while (true) + { + Symbol symbol = ScanNextToken(out int position); + if (symbol == Symbol.Eof) + { + // Check if it's the last EOF + if (_lexer.Position >= endIdx) + break; // This is the end of the file. + } + + // we need to skip over streams entirely + if (symbol == Symbol.BeginStream) + { + // We're not reading any data from the object so wee need to find endstream + int pos = _lexer.Position; + string trail = ""; + int trail_pos = pos; + while (true) + { + // look for endstream in 1k chunks. + int trail_length = Math.Min(1024, length - trail_pos); + trail += _lexer.ReadRawString(trail_pos, trail_length); + int stop = trail.IndexOf("endstream", StringComparison.Ordinal); + if (stop != -1) + { + _lexer.Position = stop + pos; + break; + } + + trail_pos = trail_pos + trail_length; + if (trail_pos + trail_length >= length) + { + // No endstream was found. + throw new Exception("endstream not found."); + } + } + } + + if (symbol == Symbol.Obj && + token_stack[0].Symbol == Symbol.Integer && + token_stack[1].Symbol == Symbol.Integer) + { + // TODO:: Do we only need the most recent revision? + PdfObjectID objectID = new PdfObjectID(token_stack[1].Number, token_stack[0].Number); + if (!xrefTable.Contains(objectID)) + xrefTable.Add(new PdfReference(objectID, token_stack[1].Position)); + //ReadObject(null, objectID, false, false); // Can't do this because the object value will never be set after + //SkipCharsUntil(Symbol.EndObj); // Can't do this because streams will cause exceptions + } + + token_stack[1] = token_stack[0]; + TokenInfo token_info = new TokenInfo { Symbol = symbol, Position = position }; + if (symbol == Symbol.Integer) + token_info.Number = _lexer.TokenToInteger; + token_stack[0] = token_info; + } + + // find the root. +// foreach (var reference in xrefTable.AllReferences) +// { +// PdfObject obj = ReadObject(null, reference.ObjectID, false, false); +// if (obj is PdfDictionary dObj) +// { +// if (dObj.Elements[PdfCatalog.Keys.Type] as PdfName == "/Catalog") +// { +// PdfCatalog catalog = new PdfCatalog(dObj); +// } +// } +// } + + + + + + + + + trailer = new PdfTrailer(_document); + trailer.ConstructFromDocument(this); + + return true; + } + + struct TokenInfo + { + public int Position; + public Symbol Symbol; + public int Number; + } + + /// + /// Reads cross reference table(s) and trailer(s). + /// + private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) { Debug.Assert(xrefTable != null); @@ -1082,6 +1362,14 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) // Skip unused entries. if (token != "n") continue; + + // Mac OS X 10.12.6 Quartz PDFContext fails to mark 0 position entries as free. + // According to spec, we could skip anything less than 8 (e.g. '%PDF-1.n' where n is a digit between 0 and 7 must be the header of a file) + // but anything between 0 and 8 (1-7) could be the indication of a much larger problem. + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.5.2 + // Skip 0 position entries. + if (position == 0) + continue; #if true //!!!new 2018-03-14 begin // Check if the object at the address has the correct ID and generation. @@ -1129,7 +1417,7 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) } return null; } - + /// /// Checks the x reference table entry. Returns true if everything is correct. /// Return false if the keyword "obj" was found, but ID or Generation are incorrect. diff --git a/src/PdfSharp/Pdf.IO/PdfReader.cs b/src/PdfSharp/Pdf.IO/PdfReader.cs index 08a9f965..21eaef5b 100644 --- a/src/PdfSharp/Pdf.IO/PdfReader.cs +++ b/src/PdfSharp/Pdf.IO/PdfReader.cs @@ -279,7 +279,9 @@ public static PdfDocument Open(Stream stream, string password, PdfDocumentOpenMo public static PdfDocument Open(Stream stream, string password, PdfDocumentOpenMode openmode, PdfPasswordProvider passwordProvider) { PdfDocument document; +#if !DEBUG try +#endif { Lexer lexer = new Lexer(stream); document = new PdfDocument(lexer); @@ -439,7 +441,7 @@ public static PdfDocument Open(Stream stream, string password, PdfDocumentOpenMo { Debug.WriteLine(ex.Message); // 4STLA rethrow exception to notify caller. - throw; + //throw; } } else @@ -500,11 +502,13 @@ public static PdfDocument Open(Stream stream, string password, PdfDocumentOpenMo document._irefTable.CheckConsistence(); } } +#if !DEBUG catch (Exception ex) { Debug.WriteLine(ex.Message); throw; } +#endif return document; } diff --git a/src/PdfSharp/Pdf.IO/PdfWriter.cs b/src/PdfSharp/Pdf.IO/PdfWriter.cs index 08071c5c..feafe404 100644 --- a/src/PdfSharp/Pdf.IO/PdfWriter.cs +++ b/src/PdfSharp/Pdf.IO/PdfWriter.cs @@ -245,7 +245,7 @@ public void Write(PdfName value) case '[': case ']': case '#': - break; + break; default: pdf.Append(name[idx]); diff --git a/src/PdfSharp/Pdf/PdfDictionary.cs b/src/PdfSharp/Pdf/PdfDictionary.cs index 428140a1..b24138af 100644 --- a/src/PdfSharp/Pdf/PdfDictionary.cs +++ b/src/PdfSharp/Pdf/PdfDictionary.cs @@ -652,8 +652,10 @@ public PdfRectangle GetRectangle(string key, bool create) array.Elements.GetReal(2), array.Elements.GetReal(3)); this[key] = value; } - else - value = (PdfRectangle)obj; + else if (obj is PdfRectangle rectangle) + { + value = rectangle; + } return value; } diff --git a/src/PdfSharp/Pdf/PdfPages.cs b/src/PdfSharp/Pdf/PdfPages.cs index de2ee441..eb612221 100644 --- a/src/PdfSharp/Pdf/PdfPages.cs +++ b/src/PdfSharp/Pdf/PdfPages.cs @@ -612,8 +612,9 @@ PdfDictionary[] GetKids(PdfReference iref, PdfPage.InheritedValues values, PdfDi PdfPage.InheritValues(kid, values); return new PdfDictionary[] { kid }; } - - if (string.IsNullOrEmpty(type)) + + // If it has kids, it's logically not going to be type page. + if (string.IsNullOrEmpty(type) && !kid.Elements.ContainsKey("/Kids")) { // Type is required. If type is missing, assume it is "/Page" and hope it will work. // TODO Implement a "Strict" mode in PDFsharp and don't do this in "Strict" mode.