Merge pull request #359 from plaisted/master

link annotation fix for PdfDocumentBuilder
This commit is contained in:
Eliot Jones 2021-08-27 07:38:22 -04:00 committed by GitHub
commit df3552c38e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 111 additions and 10 deletions

View File

@ -101,6 +101,33 @@
} }
} }
/// <summary>
/// Copies page 1 of "outline.pdf" into a new document via <c>PdfDocumentBuilder.AddPage</c>
/// and checks that link annotations are dropped from the copy while every other
/// annotation on the page is preserved.
/// </summary>
[Fact]
public void CanFastAddPageAndStripLinkAnnots()
{
    var first = IntegrationHelpers.GetDocumentPath("outline.pdf");
    var contents = File.ReadAllBytes(first);

    int expectedCount;
    byte[] results;
    using (var existing = PdfDocument.Open(contents, ParsingOptions.LenientParsingOff))
    using (var output = new PdfDocumentBuilder())
    {
        output.AddPage(existing, 1);
        results = output.Build();

        var sourceAnnots = existing.GetPage(1).ExperimentalAccess.GetAnnotations().ToList();

        // The source page must contain at least one link, otherwise the test proves nothing.
        Assert.Contains(sourceAnnots, x => x.Type == Annotations.AnnotationType.Link);

        // Derive the expectation from the number of non-link annotations rather than
        // assuming the page carries exactly one link.
        expectedCount = sourceAnnots.Count(x => x.Type != Annotations.AnnotationType.Link);
    }

    using (var rewritten = PdfDocument.Open(results, ParsingOptions.LenientParsingOff))
    {
        var copiedAnnots = rewritten.GetPage(1).ExperimentalAccess.GetAnnotations().ToList();

        Assert.Equal(expectedCount, copiedAnnots.Count);
        Assert.DoesNotContain(copiedAnnots, x => x.Type == Annotations.AnnotationType.Link);
    }
}
[Fact] [Fact]
public void CanReadSingleBlankPage() public void CanReadSingleBlankPage()

View File

@ -8,6 +8,13 @@
internal interface IPdfStreamWriter : IDisposable internal interface IPdfStreamWriter : IDisposable
{ {
/// <summary>
/// Gets or sets whether the stream writer should attempt to deduplicate objects.
/// May not have any effect if the <see cref="IPdfStreamWriter"/> implementation
/// does not support deduplication.
/// </summary>
bool AttemptDeduplication { get; set; }
/// <summary> /// <summary>
/// The underlying stream used by the writer. /// The underlying stream used by the writer.
/// </summary> /// </summary>

View File

@ -23,13 +23,16 @@
ms.SetLength(0); ms.SetLength(0);
TokenWriter.WriteToken(token, ms); TokenWriter.WriteToken(token, ms);
var contents = ms.ToArray(); var contents = ms.ToArray();
if (hashes.TryGetValue(contents, out var value)) if (AttemptDeduplication && hashes.TryGetValue(contents, out var value))
{ {
return value; return value;
} }
var ir = ReserveObjectNumber(); var ir = ReserveObjectNumber();
hashes.Add(contents, ir); if (AttemptDeduplication)
{
hashes.Add(contents, ir);
}
offsets.Add(ir.Data, Stream.Position); offsets.Add(ir.Data, Stream.Position);
TokenWriter.WriteObject(ir.Data.ObjectNumber, ir.Data.Generation, contents, Stream); TokenWriter.WriteObject(ir.Data.ObjectNumber, ir.Data.Generation, contents, Stream);

View File

@ -330,6 +330,10 @@ namespace UglyToad.PdfPig.Writer
var streams = new List<PdfPageBuilder.CopiedContentStream>(); var streams = new List<PdfPageBuilder.CopiedContentStream>();
if (pageInfo.Page.TryGet(NameToken.Contents, out IToken contentsToken)) if (pageInfo.Page.TryGet(NameToken.Contents, out IToken contentsToken))
{ {
// Adobe Acrobat errors if content streams ref'd by multiple pages, turn off
// dedup if on to avoid issues
var prev = context.AttemptDeduplication;
context.AttemptDeduplication = false;
if (contentsToken is ArrayToken array) if (contentsToken is ArrayToken array)
{ {
foreach (var item in array.Data) foreach (var item in array.Data)
@ -347,6 +351,7 @@ namespace UglyToad.PdfPig.Writer
streams.Add(new PdfPageBuilder.CopiedContentStream( streams.Add(new PdfPageBuilder.CopiedContentStream(
WriterUtil.CopyToken(context, ir, document.Structure.TokenScanner, refs) as IndirectReferenceToken)); WriterUtil.CopyToken(context, ir, document.Structure.TokenScanner, refs) as IndirectReferenceToken));
} }
context.AttemptDeduplication = prev;
} }
// manually copy page dict / resources as we need to modify some // manually copy page dict / resources as we need to modify some
@ -379,15 +384,55 @@ namespace UglyToad.PdfPig.Writer
{ {
if (kvp.Key == NameToken.Contents || kvp.Key == NameToken.Parent || kvp.Key == NameToken.Type) if (kvp.Key == NameToken.Contents || kvp.Key == NameToken.Parent || kvp.Key == NameToken.Type)
{ {
// don't copy these as they'll be handled during page tree writing
continue; continue;
} }
if (kvp.Key == NameToken.Resources) if (kvp.Key == NameToken.Resources)
{ {
// merge parent resources into child
CopyResourceDict(kvp.Value, resources); CopyResourceDict(kvp.Value, resources);
continue; continue;
} }
if (kvp.Key == NameToken.Annots)
{
var val = kvp.Value;
if (kvp.Value is IndirectReferenceToken ir)
{
val = document.Structure.TokenScanner.Get(ir.Data).Data;
}
if (!(val is ArrayToken arr))
{
// should be array... ignore and remove bad dict
continue;
}
// -> ignore links to resolve issues with referencing non-existing pages.
// At some point we should add support for copying the links if the
// target pages are copied as well, but for now just fix the corruption.
var toAdd = new List<IToken>();
foreach (var annot in arr.Data)
{
DictionaryToken tk = GetRemoteDict(annot);
if (tk == null)
{
// malformed
continue;
}
if (tk.TryGet(NameToken.Subtype, out var st) && st is NameToken nm && nm == NameToken.Link)
{
// link -> ignore
continue;
}
toAdd.Add(WriterUtil.CopyToken(context, tk, document.Structure.TokenScanner, refs));
}
// copy rest
copiedPageDict[NameToken.Annots] = new ArrayToken(toAdd);
continue;
}
copiedPageDict[NameToken.Create(kvp.Key)] = copiedPageDict[NameToken.Create(kvp.Key)] =
WriterUtil.CopyToken(context, kvp.Value, document.Structure.TokenScanner, refs); WriterUtil.CopyToken(context, kvp.Value, document.Structure.TokenScanner, refs);
} }
@ -508,10 +553,14 @@ namespace UglyToad.PdfPig.Writer
pageDictionary[NameToken.MediaBox] = RectangleToArray(page.Value.PageSize); pageDictionary[NameToken.MediaBox] = RectangleToArray(page.Value.PageSize);
} }
// Adobe Acrobat errors if content streams ref'd by multiple pages, turn off
// dedup if on to avoid issues
var prev = context.AttemptDeduplication;
context.AttemptDeduplication = false;
var toWrite = page.Value.contentStreams.Where(x => x.HasContent).ToList(); var toWrite = page.Value.contentStreams.Where(x => x.HasContent).ToList();
if (toWrite.Count == 0) if (toWrite.Count == 0)
{ {
// write empty
pageDictionary[NameToken.Contents] = new PdfPageBuilder.DefaultContentStream().Write(context); pageDictionary[NameToken.Contents] = new PdfPageBuilder.DefaultContentStream().Write(context);
} }
else if (toWrite.Count == 1) else if (toWrite.Count == 1)
@ -529,7 +578,7 @@ namespace UglyToad.PdfPig.Writer
} }
pageDictionary[NameToken.Contents] = new ArrayToken(streams); pageDictionary[NameToken.Contents] = new ArrayToken(streams);
} }
context.AttemptDeduplication = prev;;
leafChildren[leafNum].Add(context.WriteToken(new DictionaryToken(pageDictionary))); leafChildren[leafNum].Add(context.WriteToken(new DictionaryToken(pageDictionary)));

View File

@ -30,8 +30,9 @@
DisposeStream = disposeStream; DisposeStream = disposeStream;
} }
public Stream Stream { get; protected set; } public Stream Stream { get; protected set; }
/// <summary>
/// Gets or sets whether this writer should attempt to deduplicate objects. Defaults to <c>true</c>.
/// </summary>
// NOTE(review): presumably the implementation of IPdfStreamWriter.AttemptDeduplication - confirm against the class declaration.
public bool AttemptDeduplication { get; set; } = true;
public virtual IndirectReferenceToken WriteToken(IToken token) public virtual IndirectReferenceToken WriteToken(IToken token)
{ {
if (!Initialized) if (!Initialized)

View File

@ -80,7 +80,11 @@
/// <param name="token">The token to write to the stream.</param> /// <param name="token">The token to write to the stream.</param>
/// <param name="outputStream">The stream to write the token to.</param> /// <param name="outputStream">The stream to write the token to.</param>
public static void WriteToken(IToken token, Stream outputStream) public static void WriteToken(IToken token, Stream outputStream)
{ {
if (token == null)
{
throw new ArgumentNullException(nameof(token));
}
switch (token) switch (token)
{ {
case ArrayToken array: case ArrayToken array:
@ -119,7 +123,9 @@
break; break;
case StringToken stringToken: case StringToken stringToken:
WriteString(stringToken, outputStream); WriteString(stringToken, outputStream);
break; break;
default:
throw new PdfDocumentFormatException($"Attempted to write token type of {token.GetType()} but was not known.");
} }
} }
@ -294,8 +300,16 @@
foreach (var pair in dictionary.Data) foreach (var pair in dictionary.Data)
{ {
WriteName(pair.Key, outputStream); WriteName(pair.Key, outputStream);
WriteToken(pair.Value, outputStream);
// handle scenario where PdfPig has a null value under some circumstances
if (pair.Value == null)
{
WriteToken(NullToken.Instance, outputStream);
} else
{
WriteToken(pair.Value, outputStream);
}
} }
outputStream.Write(DictionaryEnd, 0, DictionaryEnd.Length); outputStream.Write(DictionaryEnd, 0, DictionaryEnd.Length);