Merge pull request #359 from plaisted/master

link annotation fix for PdfDocumentBuilder
2025-04-05 20:55:01 +08:00 · 2021-08-27 07:38:22 -04:00 · 2021-08-27 07:38:22 -04:00 · df3552c38e
commit df3552c38e
parent 905f09b18a 79b26bb434
7 changed files with 111 additions and 10 deletions
--- a/src/UglyToad.PdfPig.Tests/Integration/Documents/outline.pdf
+++ b/src/UglyToad.PdfPig.Tests/Integration/Documents/outline.pdf
--- a/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Writer/PdfDocumentBuilderTests.cs
@ -101,6 +101,33 @@
            }
        }

+        [Fact]
+        public void CanFastAddPageAndStripLinkAnnots() // copying a page via AddPage must drop its link annotations
+        {
+            var first = IntegrationHelpers.GetDocumentPath("outline.pdf");
+            var contents = File.ReadAllBytes(first);
+
+            var annotCount = 0;
+            byte[] results = null;
+            using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
+            using (var output = new PdfDocumentBuilder())
+            {
+                output.AddPage(existing, 1); // copy page 1 of the source document into the builder
+                results = output.Build();
+                var pg = existing.GetPage(1);
+                var annots = pg.ExperimentalAccess.GetAnnotations().ToList();
+                annotCount = annots.Count; // remember the source page's annotation count for the comparison below
+                Assert.Contains(annots, x => x.Type == Annotations.AnnotationType.Link); // precondition: the source page has a link annotation
+            }
+
+            using (var rewritten = PdfDocument.Open(results, ParsingOptions.LenientParsingOff)) // re-read the built bytes with strict parsing
+            {
+                var pg = rewritten.GetPage(1);
+                var annots = pg.ExperimentalAccess.GetAnnotations().ToList();
+                Assert.Equal(annotCount - 1, annots.Count); // NOTE(review): assumes page 1 of outline.pdf has exactly one link annotation - confirm
+                Assert.DoesNotContain(annots, x => x.Type == Annotations.AnnotationType.Link); // no link annotation survives the copy
+            }
+        }

        [Fact]
        public void CanReadSingleBlankPage()
--- a/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs
+++ b/src/UglyToad.PdfPig/Writer/IPdfStreamWriter.cs
@ -8,6 +8,13 @@

    internal interface IPdfStreamWriter : IDisposable
    {
+        /// <summary>
+        /// Sets whether the stream writer should attempt to deduplicate objects.
+        /// May not have any effect if the <see cref="IPdfStreamWriter"/> does not
+        /// support deduplication.
+        /// </summary>
+        bool AttemptDeduplication { get; set; }
+
        /// <summary>
        /// The underlying stream used by the writer.
        /// </summary>
--- a/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs
+++ b/src/UglyToad.PdfPig/Writer/PdfDedupStreamWriter.cs
@ -23,13 +23,16 @@
            ms.SetLength(0);
            TokenWriter.WriteToken(token, ms);
            var contents = ms.ToArray();
-            if (hashes.TryGetValue(contents, out var value))
+            if (AttemptDeduplication && hashes.TryGetValue(contents, out var value))
            {
                return value;
            }

            var ir = ReserveObjectNumber();
-            hashes.Add(contents, ir);
+            if (AttemptDeduplication)
+            {
+                hashes.Add(contents, ir);
+            }

            offsets.Add(ir.Data, Stream.Position);
            TokenWriter.WriteObject(ir.Data.ObjectNumber, ir.Data.Generation, contents, Stream);
--- a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs
+++ b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs
@ -330,6 +330,10 @@ namespace UglyToad.PdfPig.Writer
            var streams = new List<PdfPageBuilder.CopiedContentStream>();
            if (pageInfo.Page.TryGet(NameToken.Contents, out IToken contentsToken))
            {
+                // Adobe Acrobat reports errors if a content stream is referenced by multiple pages,
+                // so temporarily turn off deduplication (if it is on) while copying the content streams
+                var prev = context.AttemptDeduplication;
+                context.AttemptDeduplication = false;
                if (contentsToken is ArrayToken array)
                {
                    foreach (var item in array.Data)
@ -347,6 +351,7 @@ namespace UglyToad.PdfPig.Writer
                    streams.Add(new PdfPageBuilder.CopiedContentStream(
                        WriterUtil.CopyToken(context, ir, document.Structure.TokenScanner, refs) as IndirectReferenceToken));
                }
+                context.AttemptDeduplication = prev;
            }

            // manually copy page dict / resources as we need to modify some
@ -379,15 +384,55 @@ namespace UglyToad.PdfPig.Writer
            {
                if (kvp.Key == NameToken.Contents || kvp.Key == NameToken.Parent || kvp.Key == NameToken.Type)
                {
+                    // don't copy these as they'll be handled during page tree writing
                    continue;
                }

                if (kvp.Key == NameToken.Resources)
                {
+                    // merge parent resources into child
                    CopyResourceDict(kvp.Value, resources);
                    continue;
                }

+                if (kvp.Key == NameToken.Annots)
+                {
+                    var val = kvp.Value;
+                    if (kvp.Value is IndirectReferenceToken ir)
+                    {
+                        val = document.Structure.TokenScanner.Get(ir.Data).Data;
+                    }
+                                                    
+                    if (!(val is ArrayToken arr))
+                    {
+                        // should be array... ignore and remove bad dict
+                        continue;
+                    }
+
+                    // -> ignore links to resolve issues with referencing non-existent pages;
+                    // at some point we should add support for copying the links if the
+                    // pages are copied as well, but for now just fix the corruption
+                    var toAdd = new List<IToken>();
+                    foreach (var annot in arr.Data)
+                    {
+                        DictionaryToken tk = GetRemoteDict(annot);
+                        if (tk == null)
+                        {
+                            // malformed
+                            continue;
+                        }
+                        if (tk.TryGet(NameToken.Subtype, out var st) && st is NameToken nm && nm == NameToken.Link)
+                        {
+                            // link -> ignore
+                            continue;
+                        }
+                        toAdd.Add(WriterUtil.CopyToken(context, tk, document.Structure.TokenScanner, refs));
+                    }
+                    // copy rest
+                    copiedPageDict[NameToken.Annots] = new ArrayToken(toAdd);
+                    continue;
+                }
+
                copiedPageDict[NameToken.Create(kvp.Key)] =
                    WriterUtil.CopyToken(context, kvp.Value, document.Structure.TokenScanner, refs);
            }
@ -508,10 +553,14 @@ namespace UglyToad.PdfPig.Writer
                    pageDictionary[NameToken.MediaBox] = RectangleToArray(page.Value.PageSize);
                }

+                // Adobe Acrobat reports errors if a content stream is referenced by multiple pages,
+                // so temporarily turn off deduplication (if it is on) while writing the content streams
+                var prev = context.AttemptDeduplication;
+                context.AttemptDeduplication = false;
+
                var toWrite = page.Value.contentStreams.Where(x => x.HasContent).ToList();
                if (toWrite.Count == 0)
                {
-                    // write empty
                    pageDictionary[NameToken.Contents] = new PdfPageBuilder.DefaultContentStream().Write(context);
                }
                else if (toWrite.Count == 1)
@ -529,7 +578,7 @@ namespace UglyToad.PdfPig.Writer
                    }
                    pageDictionary[NameToken.Contents] = new ArrayToken(streams);
                }
-
+                context.AttemptDeduplication = prev;;

                leafChildren[leafNum].Add(context.WriteToken(new DictionaryToken(pageDictionary)));

--- a/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs
+++ b/src/UglyToad.PdfPig/Writer/PdfStreamWriter.cs
@ -30,8 +30,9 @@
            DisposeStream = disposeStream;
        }

-        public Stream Stream { get; protected set; }
-
+        public Stream Stream { get; protected set; }
+        public bool AttemptDeduplication { get; set; } = true;
+
        public virtual IndirectReferenceToken WriteToken(IToken token)
        {
            if (!Initialized)
--- a/src/UglyToad.PdfPig/Writer/TokenWriter.cs
+++ b/src/UglyToad.PdfPig/Writer/TokenWriter.cs
@ -80,7 +80,11 @@
        /// <param name="token">The token to write to the stream.</param>
        /// <param name="outputStream">The stream to write the token to.</param>
        public static void WriteToken(IToken token, Stream outputStream)
-        {
+        {
+            if (token == null)
+            {
+                throw new ArgumentNullException(nameof(token));
+            }
            switch (token)
            {
                case ArrayToken array:
@ -119,7 +123,9 @@
                    break;
                case StringToken stringToken:
                    WriteString(stringToken, outputStream);
-                    break;
+                    break;
+                default:
+                    throw new PdfDocumentFormatException($"Attempted to write token type of {token.GetType()} but was not known.");
            }
        }

@ -294,8 +300,16 @@

            foreach (var pair in dictionary.Data)
            {
-                WriteName(pair.Key, outputStream);
-                WriteToken(pair.Value, outputStream);
+                WriteName(pair.Key, outputStream);
+
+                // handle scenario where PdfPig has a null value under some circumstances
+                if (pair.Value == null)
+                {
+                    WriteToken(NullToken.Instance, outputStream);
+                } else
+                {
+                    WriteToken(pair.Value, outputStream);
+                }
            }

            outputStream.Write(DictionaryEnd, 0, DictionaryEnd.Length);