Merge pull request #359 from plaisted/master

link annotation fix for PdfDocumentBuilder
This commit is contained in:
Eliot Jones 2021-08-27 07:38:22 -04:00 committed by GitHub
commit df3552c38e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 111 additions and 10 deletions

View File

@ -101,6 +101,33 @@
}
}
[Fact]
public void CanFastAddPageAndStripLinkAnnots()
{
    // Read the fixture into memory so the source document is opened from bytes.
    var sourceBytes = File.ReadAllBytes(IntegrationHelpers.GetDocumentPath("outline.pdf"));

    int sourceAnnotationCount;
    byte[] builtBytes;

    using (var source = PdfDocument.Open(sourceBytes, ParsingOptions.LenientParsingOff))
    using (var builder = new PdfDocumentBuilder())
    {
        // Copy page 1 across and produce the new document.
        builder.AddPage(source, 1);
        builtBytes = builder.Build();

        // Sanity check: the source page has to carry at least one link annotation,
        // otherwise the assertions on the rewritten document would be meaningless.
        var sourceAnnotations = source.GetPage(1).ExperimentalAccess.GetAnnotations().ToList();
        sourceAnnotationCount = sourceAnnotations.Count;
        Assert.Contains(sourceAnnotations, a => a.Type == Annotations.AnnotationType.Link);
    }

    using (var copy = PdfDocument.Open(builtBytes, ParsingOptions.LenientParsingOff))
    {
        var copiedAnnotations = copy.GetPage(1).ExperimentalAccess.GetAnnotations().ToList();

        // The copied page is expected to have exactly one annotation fewer, and no links.
        Assert.Equal(sourceAnnotationCount - 1, copiedAnnotations.Count);
        Assert.DoesNotContain(copiedAnnotations, a => a.Type == Annotations.AnnotationType.Link);
    }
}
[Fact]
public void CanReadSingleBlankPage()

View File

@ -8,6 +8,13 @@
internal interface IPdfStreamWriter : IDisposable
{
/// <summary>
/// Gets or sets whether the stream writer should attempt to deduplicate objects.
/// May not have any effect if the <see cref="IPdfStreamWriter"/> implementation
/// does not support deduplication.
/// </summary>
bool AttemptDeduplication { get; set; }
/// <summary>
/// The underlying stream used by the writer.
/// </summary>

View File

@ -23,13 +23,16 @@
ms.SetLength(0);
TokenWriter.WriteToken(token, ms);
var contents = ms.ToArray();
if (hashes.TryGetValue(contents, out var value))
if (AttemptDeduplication && hashes.TryGetValue(contents, out var value))
{
return value;
}
var ir = ReserveObjectNumber();
hashes.Add(contents, ir);
if (AttemptDeduplication)
{
hashes.Add(contents, ir);
}
offsets.Add(ir.Data, Stream.Position);
TokenWriter.WriteObject(ir.Data.ObjectNumber, ir.Data.Generation, contents, Stream);

View File

@ -330,6 +330,10 @@ namespace UglyToad.PdfPig.Writer
var streams = new List<PdfPageBuilder.CopiedContentStream>();
if (pageInfo.Page.TryGet(NameToken.Contents, out IToken contentsToken))
{
// Adobe Acrobat errors if content streams ref'd by multiple pages, turn off
// dedup if on to avoid issues
var prev = context.AttemptDeduplication;
context.AttemptDeduplication = false;
if (contentsToken is ArrayToken array)
{
foreach (var item in array.Data)
@ -347,6 +351,7 @@ namespace UglyToad.PdfPig.Writer
streams.Add(new PdfPageBuilder.CopiedContentStream(
WriterUtil.CopyToken(context, ir, document.Structure.TokenScanner, refs) as IndirectReferenceToken));
}
context.AttemptDeduplication = prev;
}
// manually copy page dict / resources as we need to modify some
@ -379,15 +384,55 @@ namespace UglyToad.PdfPig.Writer
{
if (kvp.Key == NameToken.Contents || kvp.Key == NameToken.Parent || kvp.Key == NameToken.Type)
{
// don't copy these as they'll be handled during page tree writing
continue;
}
if (kvp.Key == NameToken.Resources)
{
// merge parent resources into child
CopyResourceDict(kvp.Value, resources);
continue;
}
if (kvp.Key == NameToken.Annots)
{
var val = kvp.Value;
if (kvp.Value is IndirectReferenceToken ir)
{
val = document.Structure.TokenScanner.Get(ir.Data).Data;
}
if (!(val is ArrayToken arr))
{
// should be array... ignore and remove bad dict
continue;
}
// -> ignore links to resolve issues with referencing non-existing pages
// at some point should add support for copying the links if the
// pages are copied as well but for now just fix corruption
var toAdd = new List<IToken>();
foreach (var annot in arr.Data)
{
DictionaryToken tk = GetRemoteDict(annot);
if (tk == null)
{
// malformed
continue;
}
if (tk.TryGet(NameToken.Subtype, out var st) && st is NameToken nm && nm == NameToken.Link)
{
// link -> ignore
continue;
}
toAdd.Add(WriterUtil.CopyToken(context, tk, document.Structure.TokenScanner, refs));
}
// copy rest
copiedPageDict[NameToken.Annots] = new ArrayToken(toAdd);
continue;
}
copiedPageDict[NameToken.Create(kvp.Key)] =
WriterUtil.CopyToken(context, kvp.Value, document.Structure.TokenScanner, refs);
}
@ -508,10 +553,14 @@ namespace UglyToad.PdfPig.Writer
pageDictionary[NameToken.MediaBox] = RectangleToArray(page.Value.PageSize);
}
// Adobe Acrobat errors if content streams ref'd by multiple pages, turn off
// dedup if on to avoid issues
var prev = context.AttemptDeduplication;
context.AttemptDeduplication = false;
var toWrite = page.Value.contentStreams.Where(x => x.HasContent).ToList();
if (toWrite.Count == 0)
{
// write empty
pageDictionary[NameToken.Contents] = new PdfPageBuilder.DefaultContentStream().Write(context);
}
else if (toWrite.Count == 1)
@ -529,7 +578,7 @@ namespace UglyToad.PdfPig.Writer
}
pageDictionary[NameToken.Contents] = new ArrayToken(streams);
}
context.AttemptDeduplication = prev;;
leafChildren[leafNum].Add(context.WriteToken(new DictionaryToken(pageDictionary)));

View File

@ -30,8 +30,9 @@
DisposeStream = disposeStream;
}
public Stream Stream { get; protected set; }
public Stream Stream { get; protected set; }
/// <summary>
/// Gets or sets whether this writer should attempt to deduplicate objects.
/// Defaults to <c>true</c>.
/// </summary>
public bool AttemptDeduplication { get; set; } = true;
public virtual IndirectReferenceToken WriteToken(IToken token)
{
if (!Initialized)

View File

@ -80,7 +80,11 @@
/// <param name="token">The token to write to the stream.</param>
/// <param name="outputStream">The stream to write the token to.</param>
public static void WriteToken(IToken token, Stream outputStream)
{
{
if (token == null)
{
throw new ArgumentNullException(nameof(token));
}
switch (token)
{
case ArrayToken array:
@ -119,7 +123,9 @@
break;
case StringToken stringToken:
WriteString(stringToken, outputStream);
break;
break;
default:
throw new PdfDocumentFormatException($"Attempted to write token type of {token.GetType()} but was not known.");
}
}
@ -294,8 +300,16 @@
foreach (var pair in dictionary.Data)
{
WriteName(pair.Key, outputStream);
WriteToken(pair.Value, outputStream);
WriteName(pair.Key, outputStream);
// handle scenario where PdfPig has a null value under some circumstances
if (pair.Value == null)
{
WriteToken(NullToken.Instance, outputStream);
} else
{
WriteToken(pair.Value, outputStream);
}
}
outputStream.Write(DictionaryEnd, 0, DictionaryEnd.Length);