diff --git a/src/UglyToad.PdfPig.Tests/Integration/AdvancedPdfDocumentAccessTests.cs b/src/UglyToad.PdfPig.Tests/Integration/AdvancedPdfDocumentAccessTests.cs index 064cc33f..fd4ce14c 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/AdvancedPdfDocumentAccessTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/AdvancedPdfDocumentAccessTests.cs @@ -13,7 +13,7 @@ using (var document = PdfDocument.Open(path)) { - var pg = document.Structure.Catalog.GetPageNode(1).NodeDictionary; + var pg = document.Structure.Catalog.Pages.GetPageNode(1).NodeDictionary; var contents = pg.Data[NameToken.Contents] as IndirectReferenceToken; document.Advanced.ReplaceIndirectObject(contents.Data, tk => { @@ -39,7 +39,7 @@ dict[NameToken.Length] = new NumericToken(0); var replacement = new StreamToken(new DictionaryToken(dict), new List()); - var pg = document.Structure.Catalog.GetPageNode(1).NodeDictionary; + var pg = document.Structure.Catalog.Pages.GetPageNode(1).NodeDictionary; var contents = pg.Data[NameToken.Contents] as IndirectReferenceToken; document.Advanced.ReplaceIndirectObject(contents.Data, replacement); diff --git a/src/UglyToad.PdfPig.Tests/Integration/AnnotationsTest.cs b/src/UglyToad.PdfPig.Tests/Integration/AnnotationsTest.cs new file mode 100644 index 00000000..5353b1de --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/AnnotationsTest.cs @@ -0,0 +1,51 @@ +namespace UglyToad.PdfPig.Tests.Integration +{ + using Actions; + using System.Linq; + using Xunit; + + public class AnnotationsTest + { + [Fact] + public void AnnotationsHaveActions() + { + var pdf = IntegrationHelpers.GetDocumentPath("toc"); + + using (var doc = PdfDocument.Open(pdf)) + { + var annots = doc.GetPage(1).ExperimentalAccess.GetAnnotations().ToArray(); + Assert.Equal(5, annots.Length); + Assert.All(annots, a => Assert.NotNull(a.Action)); + Assert.All(annots, a => Assert.IsType(a.Action)); + Assert.All(annots, a => Assert.True((a.Action as GoToAction).Destination.PageNumber > 0)); + } + } + + [Fact] + public void CheckAnnotationAppearanceStreams() + { + var pdf = IntegrationHelpers.GetSpecificTestDocumentPath("appearances"); + using (var doc = PdfDocument.Open(pdf)) + { + var annotations = doc.GetPage(1).ExperimentalAccess.GetAnnotations().ToArray(); + Assert.Equal(1, annotations.Length); + var annotation = annotations[0]; + + Assert.True(annotation.HasDownAppearance); + Assert.True(annotation.HasNormalAppearance); + Assert.False(annotation.HasRollOverAppearance); + + Assert.False(annotation.downAppearanceStream.IsStateless); + Assert.False(annotation.normalAppearanceStream.IsStateless); + + Assert.Contains("Off", annotation.downAppearanceStream.GetStates); + Assert.Contains("Yes", annotation.downAppearanceStream.GetStates); + + Assert.Contains("Off", annotation.normalAppearanceStream.GetStates); + Assert.Contains("Yes", annotation.normalAppearanceStream.GetStates); + + Assert.Equal("Off", annotation.appearanceState); + } + } + } +} diff --git a/src/UglyToad.PdfPig.Tests/Integration/Documents/toc.pdf b/src/UglyToad.PdfPig.Tests/Integration/Documents/toc.pdf new file mode 100644 index 00000000..e2806369 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/Documents/toc.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/appearances.pdf b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/appearances.pdf new file mode 100644 index 00000000..fc872256 Binary files /dev/null and b/src/UglyToad.PdfPig.Tests/Integration/SpecificTestDocuments/appearances.pdf differ diff --git a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs index 07cc0e0f..ac26fba3 100644 --- a/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs +++ b/src/UglyToad.PdfPig.Tests/PublicApiScannerTests.cs @@ -58,11 +58,19 @@ "UglyToad.PdfPig.AcroForms.Fields.AcroSignatureField", "UglyToad.PdfPig.AcroForms.Fields.AcroTextField", "UglyToad.PdfPig.AcroForms.Fields.AcroTextFieldFlags", + "UglyToad.PdfPig.Actions.AbstractGoToAction", + "UglyToad.PdfPig.Actions.PdfAction", + "UglyToad.PdfPig.Actions.ActionType", + "UglyToad.PdfPig.Actions.GoToAction", + "UglyToad.PdfPig.Actions.GoToEAction", + "UglyToad.PdfPig.Actions.GoToRAction", + "UglyToad.PdfPig.Actions.UriAction", "UglyToad.PdfPig.AdvancedPdfDocumentAccess", "UglyToad.PdfPig.Annotations.Annotation", "UglyToad.PdfPig.Annotations.AnnotationBorder", "UglyToad.PdfPig.Annotations.AnnotationFlags", "UglyToad.PdfPig.Annotations.AnnotationType", + "UglyToad.PdfPig.Annotations.AppearanceStream", "UglyToad.PdfPig.Annotations.QuadPointsQuadrilateral", "UglyToad.PdfPig.Content.ArtifactMarkedContentElement", "UglyToad.PdfPig.Content.Catalog", @@ -207,6 +215,7 @@ "UglyToad.PdfPig.Outline.Bookmarks", "UglyToad.PdfPig.Outline.BookmarkNode", "UglyToad.PdfPig.Outline.DocumentBookmarkNode", + "UglyToad.PdfPig.Outline.EmbeddedBookmarkNode", "UglyToad.PdfPig.Outline.ExternalBookmarkNode", "UglyToad.PdfPig.Outline.UriBookmarkNode", "UglyToad.PdfPig.Outline.Destinations.ExplicitDestination", diff --git a/src/UglyToad.PdfPig.Tokens/NameToken.Constants.cs b/src/UglyToad.PdfPig.Tokens/NameToken.Constants.cs index 79d9da6f..ec8331fa 100644 --- a/src/UglyToad.PdfPig.Tokens/NameToken.Constants.cs +++ b/src/UglyToad.PdfPig.Tokens/NameToken.Constants.cs @@ -255,6 +255,7 @@ public static readonly NameToken G = new NameToken("G"); public static readonly NameToken Gamma = new NameToken("Gamma"); public static readonly NameToken GoTo = new NameToken("GoTo"); + public static readonly NameToken GoToE = new NameToken("GoToE"); public static readonly NameToken GoToR = new NameToken("GoToR"); public static readonly NameToken Group = new NameToken("Group"); public static readonly NameToken GtsPdfa1 = new NameToken("GTS_PDFA1"); diff --git a/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs b/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs index 1e05043f..c3676902 100644 --- a/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs +++ b/src/UglyToad.PdfPig/AcroForms/AcroFormFactory.cs @@ -187,7 +187,7 @@ int? pageNumber = null; if (fieldDictionary.TryGet(NameToken.P, tokenScanner, out IndirectReferenceToken pageReference)) { - pageNumber = catalog.GetPageByReference(pageReference.Data)?.PageNumber; + pageNumber = catalog.Pages.GetPageByReference(pageReference.Data)?.PageNumber; } PdfRectangle? bounds = null; diff --git a/src/UglyToad.PdfPig/Actions/AbstractGoToAction.cs b/src/UglyToad.PdfPig/Actions/AbstractGoToAction.cs new file mode 100644 index 00000000..c14337dd --- /dev/null +++ b/src/UglyToad.PdfPig/Actions/AbstractGoToAction.cs @@ -0,0 +1,24 @@ +namespace UglyToad.PdfPig.Actions; + +using Outline.Destinations; + +/// +/// Abstract class for GoTo-type actions (GoTo, GoToE, GoToR) that have a destination +/// +public abstract class AbstractGoToAction : PdfAction +{ + /// + /// Destination for the GoTo-type action + /// + public ExplicitDestination Destination { get; } + + /// + /// Constructor + /// + /// + /// + protected AbstractGoToAction(ActionType type, ExplicitDestination destination) : base(type) + { + Destination = destination; + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Actions/ActionProvider.cs b/src/UglyToad.PdfPig/Actions/ActionProvider.cs new file mode 100644 index 00000000..b22f23da --- /dev/null +++ b/src/UglyToad.PdfPig/Actions/ActionProvider.cs @@ -0,0 +1,107 @@ +namespace UglyToad.PdfPig.Actions +{ + using Core; + using Logging; + using Outline; + using Tokenization.Scanner; + using Tokens; + using Outline.Destinations; + using Util; + + internal static class ActionProvider + { + /// + /// Get an action (A) from dictionary. If GoTo, GoToR or GoToE, also fetches the action destination. + /// + /// + /// + /// + /// + /// + /// + /// + internal static bool TryGetAction(DictionaryToken dictionary, + NamedDestinations namedDestinations, + IPdfTokenScanner pdfScanner, + ILog log, + out PdfAction result) + { + result = null; + + if (!dictionary.TryGet(NameToken.A, pdfScanner, out DictionaryToken actionDictionary)) + { + return false; + } + + if (!actionDictionary.TryGet(NameToken.S, pdfScanner, out NameToken actionType)) + { + throw new PdfDocumentFormatException($"No action type (/S) specified for action: {actionDictionary}."); + } + + if (actionType.Equals(NameToken.GoTo)) + { + // For GoTo, D(estination) is required + if (DestinationProvider.TryGetDestination(actionDictionary, + NameToken.D, + namedDestinations, + pdfScanner, + log, + false, + out var destination)) + { + result = new GoToAction(destination); + return true; + } + } + else if (actionType.Equals(NameToken.GoToR)) + { + // For GoToR, F(ile) and D(estination) are required + if (actionDictionary.TryGetOptionalStringDirect(NameToken.F, pdfScanner, out var filename) + && DestinationProvider.TryGetDestination(actionDictionary, + NameToken.D, + namedDestinations, + pdfScanner, + log, + true, + out var destination)) + { + result = new GoToRAction(destination, filename); + return true; + } + } + else if (actionType.Equals(NameToken.GoToE)) + { + // For GoToE, D(estination) is required + if (DestinationProvider.TryGetDestination(actionDictionary, + NameToken.D, + namedDestinations, + pdfScanner, + log, + true, + out var destination)) + { + // F(ile specification) is optional + if (!actionDictionary.TryGetOptionalStringDirect(NameToken.F, + pdfScanner, + out var fileSpecification)) + { + fileSpecification = null; + } + + result = new GoToEAction(destination, fileSpecification); + return true; + } + } + else if (actionType.Equals(NameToken.Uri)) + { + if (!actionDictionary.TryGetOptionalStringDirect(NameToken.Uri, pdfScanner, out var uri)) + { + uri = null; + } + result = new UriAction(uri); + return true; + } + return false; + } + } +} diff --git a/src/UglyToad.PdfPig/Actions/ActionType.cs b/src/UglyToad.PdfPig/Actions/ActionType.cs new file mode 100644 index 00000000..9a76841d --- /dev/null +++ b/src/UglyToad.PdfPig/Actions/ActionType.cs @@ -0,0 +1,80 @@ +namespace UglyToad.PdfPig.Actions; + +/// +/// Action types (PDF reference 8.5.3) +/// +public enum ActionType +{ + /// + /// Go to a destination in the current document. + /// + GoTo, + /// + /// (“Go-to remote”) Go to a destination in another document. + /// + GoToR, + /// + /// (“Go-to embedded”; PDF 1.6) Go to a destination in an embedded file. + /// + GoToE, + /// + /// Launch an application, usually to open a file. + /// + Launch, + /// + /// Begin reading an article thread. + /// + Thread, + /// + /// Resolve a uniform resource identifier. + /// + URI, + /// + /// (PDF 1.2) Play a sound. + /// + Sound, + /// + /// (PDF 1.2) Play a movie. + /// + Movie, + /// + /// (PDF 1.2) Set an annotation’s Hidden flag. + /// + Hide, + /// + /// (PDF 1.2) Execute an action predefined by the viewer application. + /// + Named, + /// + /// (PDF 1.2) Send data to a uniform resource locator. + /// + SubmitForm, + /// + /// (PDF 1.2) Set fields to their default values. + /// + ResetForm, + /// + /// (PDF 1.2) Import field values from a file. + /// + ImportData, + /// + /// (PDF 1.3) Execute a JavaScript script. + /// + JavaScript, + /// + /// (PDF 1.5) Set the states of optional content groups. + /// + SetOCGState, + /// + /// (PDF 1.5) Controls the playing of multimedia content. + /// + Rendition, + /// + /// (PDF 1.5) Updates the display of a document, using a transition dictionary. + /// + Trans, + /// + /// (PDF 1.6) Set the current view of a 3D annotation + /// + GoTo3DView +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Actions/GoToAction.cs b/src/UglyToad.PdfPig/Actions/GoToAction.cs new file mode 100644 index 00000000..aa94e00d --- /dev/null +++ b/src/UglyToad.PdfPig/Actions/GoToAction.cs @@ -0,0 +1,17 @@ +namespace UglyToad.PdfPig.Actions; + +using Outline.Destinations; + +/// +/// GoTo action (with a destination inside the current document) +/// +public class GoToAction : AbstractGoToAction +{ + /// + /// Constructor + /// + /// + public GoToAction(ExplicitDestination destination) : base(ActionType.GoTo, destination) + { + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Actions/GoToEAction.cs b/src/UglyToad.PdfPig/Actions/GoToEAction.cs new file mode 100644 index 00000000..c57d2d3a --- /dev/null +++ b/src/UglyToad.PdfPig/Actions/GoToEAction.cs @@ -0,0 +1,24 @@ +namespace UglyToad.PdfPig.Actions; + +using Outline.Destinations; + +/// +/// GoToE action (to go to a destination inside a file embedded within the PDF) +/// +public class GoToEAction : AbstractGoToAction +{ + /// + /// File specification of the embedded file + /// + public string FileSpecification { get; } + + /// + /// Constructor + /// + /// Destination within the embedded file + /// Specification of the embedded file + public GoToEAction(ExplicitDestination destination, string fileSpecification) : base(ActionType.GoToE, destination) + { + FileSpecification = fileSpecification; + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Actions/GoToRAction.cs b/src/UglyToad.PdfPig/Actions/GoToRAction.cs new file mode 100644 index 00000000..8aa65849 --- /dev/null +++ b/src/UglyToad.PdfPig/Actions/GoToRAction.cs @@ -0,0 +1,24 @@ +namespace UglyToad.PdfPig.Actions; + +using Outline.Destinations; + +/// +/// GoToR action, to go to a destination in a remote PDF +/// +public class GoToRAction : AbstractGoToAction +{ + /// + /// Filename of the remote PDF + /// + public string Filename { get; } + + /// + /// Constructor + /// + /// Destination within the remote PDF + /// Filename of the remote PDF + public GoToRAction(ExplicitDestination destination, string filename) : base(ActionType.GoToR, destination) + { + Filename = filename; + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Actions/PdfAction.cs b/src/UglyToad.PdfPig/Actions/PdfAction.cs new file mode 100644 index 00000000..8c46e01e --- /dev/null +++ b/src/UglyToad.PdfPig/Actions/PdfAction.cs @@ -0,0 +1,22 @@ +namespace UglyToad.PdfPig.Actions +{ + /// + /// Actions (PDF reference 8.5) + /// + public class PdfAction + { + /// + /// Type of action + /// + public ActionType Type { get; } + + /// + /// Constructor + /// + /// + protected PdfAction(ActionType type) + { + Type = type; + } + } +} diff --git a/src/UglyToad.PdfPig/Actions/UriAction.cs b/src/UglyToad.PdfPig/Actions/UriAction.cs new file mode 100644 index 00000000..ead37582 --- /dev/null +++ b/src/UglyToad.PdfPig/Actions/UriAction.cs @@ -0,0 +1,21 @@ +namespace UglyToad.PdfPig.Actions; + +/// +/// Action to open a URI +/// +public class UriAction : PdfAction +{ + /// + /// URI to open + /// + public string Uri { get; } + + /// + /// Constructor + /// + /// URI to open + public UriAction(string uri) : base(ActionType.URI) + { + Uri = uri; + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Annotations/Annotation.cs b/src/UglyToad.PdfPig/Annotations/Annotation.cs index aa19dd4d..e2cecca9 100644 --- a/src/UglyToad.PdfPig/Annotations/Annotation.cs +++ b/src/UglyToad.PdfPig/Annotations/Annotation.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; using Core; + using Actions; using Tokens; using Util.JetBrains.Annotations; @@ -11,9 +12,10 @@ /// public class Annotation { - private readonly StreamToken normalAppearanceStream; - private readonly StreamToken rollOverAppearanceStream; - private readonly StreamToken downAppearanceStream; + internal readonly AppearanceStream normalAppearanceStream; + internal readonly AppearanceStream rollOverAppearanceStream; + internal readonly AppearanceStream downAppearanceStream; + internal readonly string appearanceState; /// /// The underlying PDF dictionary which this annotation was created from. @@ -66,6 +68,16 @@ /// public IReadOnlyList QuadPoints { get; } + /// + /// Action for this annotation, if any (can be null) + /// + public PdfAction Action { get; } + + /// + /// Indicates if a normal appearance is present for this annotation + /// + public bool HasNormalAppearance => normalAppearanceStream != null; + /// /// Indicates if a roll over appearance is present for this annotation (shown when you hover over this annotation) /// @@ -79,9 +91,12 @@ /// /// Create a new . /// - public Annotation(DictionaryToken annotationDictionary, AnnotationType type, PdfRectangle rectangle, string content, string name, string modifiedDate, + public Annotation(DictionaryToken annotationDictionary, AnnotationType type, PdfRectangle rectangle, + string content, string name, string modifiedDate, AnnotationFlags flags, AnnotationBorder border, IReadOnlyList quadPoints, - StreamToken normalAppearanceStream, StreamToken rollOverAppearanceStream, StreamToken downAppearanceStream) + PdfAction action, + AppearanceStream normalAppearanceStream, AppearanceStream rollOverAppearanceStream, + AppearanceStream downAppearanceStream, string appearanceState) { AnnotationDictionary = annotationDictionary ?? throw new ArgumentNullException(nameof(annotationDictionary)); Type = type; @@ -92,9 +107,11 @@ Flags = flags; Border = border; QuadPoints = quadPoints ?? EmptyArray.Instance; + Action = action; this.normalAppearanceStream = normalAppearanceStream; this.rollOverAppearanceStream = rollOverAppearanceStream; this.downAppearanceStream = downAppearanceStream; + this.appearanceState = appearanceState; } /// diff --git a/src/UglyToad.PdfPig/Annotations/AnnotationProvider.cs b/src/UglyToad.PdfPig/Annotations/AnnotationProvider.cs index 1e452846..2c7fdc73 100644 --- a/src/UglyToad.PdfPig/Annotations/AnnotationProvider.cs +++ b/src/UglyToad.PdfPig/Annotations/AnnotationProvider.cs @@ -1,9 +1,13 @@ namespace UglyToad.PdfPig.Annotations { + using Actions; using System; using System.Collections.Generic; using System.Linq; using Core; + using Logging; + using Outline; + using Outline.Destinations; using Parser.Parts; using Tokenization.Scanner; using Tokens; @@ -13,14 +17,18 @@ { private readonly IPdfTokenScanner tokenScanner; private readonly DictionaryToken pageDictionary; + private readonly NamedDestinations namedDestinations; + private readonly ILog log; private readonly TransformationMatrix matrix; public AnnotationProvider(IPdfTokenScanner tokenScanner, DictionaryToken pageDictionary, - TransformationMatrix matrix) + TransformationMatrix matrix, NamedDestinations namedDestinations, ILog log) { this.matrix = matrix; this.tokenScanner = tokenScanner ?? throw new ArgumentNullException(nameof(tokenScanner)); this.pageDictionary = pageDictionary ?? throw new ArgumentNullException(nameof(pageDictionary)); + this.namedDestinations = namedDestinations; + this.log = log; } public IEnumerable GetAnnotations() @@ -38,10 +46,9 @@ } var type = annotationDictionary.Get(NameToken.Subtype, tokenScanner); - var annotationType = type.ToAnnotationType(); + var action = GetAction(annotationDictionary); var rectangle = matrix.Transform(annotationDictionary.Get(NameToken.Rect, tokenScanner).ToRectangle(tokenScanner)); - var contents = GetNamedString(NameToken.Contents, annotationDictionary); var name = GetNamedString(NameToken.Nm, annotationDictionary); // As indicated in PDF reference 8.4.1, the modified date can be anything, but is usually a date formatted according to sec. 3.8.3 @@ -98,32 +105,67 @@ } } - StreamToken normalAppearanceStream = null, downAppearanceStream = null, rollOverAppearanceStream = null; + AppearanceStream normalAppearanceStream = null; + AppearanceStream downAppearanceStream = null; + AppearanceStream rollOverAppearanceStream = null; + if (annotationDictionary.TryGet(NameToken.Ap, out DictionaryToken appearanceDictionary)) { // The normal appearance of this annotation - if (appearanceDictionary.TryGet(NameToken.N, out IndirectReferenceToken normalAppearanceRef)) + if (AppearanceStreamFactory.TryCreate(appearanceDictionary, NameToken.N, tokenScanner, out AppearanceStream stream)) { - normalAppearanceStream = tokenScanner.Get(normalAppearanceRef.Data)?.Data as StreamToken; + normalAppearanceStream = stream; } + // If present, the 'roll over' appearance of this annotation (when hovering the mouse pointer over this annotation) - if (appearanceDictionary.TryGet(NameToken.R, out IndirectReferenceToken rollOverAppearanceRef)) + if (AppearanceStreamFactory.TryCreate(appearanceDictionary, NameToken.R, tokenScanner, out stream)) { - rollOverAppearanceStream = tokenScanner.Get(rollOverAppearanceRef.Data)?.Data as StreamToken; + rollOverAppearanceStream = stream; } + // If present, the 'down' appearance of this annotation (when you click on it) - if (appearanceDictionary.TryGet(NameToken.D, out IndirectReferenceToken downAppearanceRef)) + if (AppearanceStreamFactory.TryCreate(appearanceDictionary, NameToken.D, tokenScanner, out stream)) { - downAppearanceStream = tokenScanner.Get(downAppearanceRef.Data)?.Data as StreamToken; + downAppearanceStream = stream; } } + string appearanceState = null; + if (annotationDictionary.TryGet(NameToken.As, out NameToken appearanceStateToken)) + { + appearanceState = appearanceStateToken.Data; + } + yield return new Annotation(annotationDictionary, annotationType, rectangle, - contents, name, modifiedDate, flags, border, quadPointRectangles, - normalAppearanceStream, rollOverAppearanceStream, downAppearanceStream); + contents, name, modifiedDate, flags, border, quadPointRectangles, action, + normalAppearanceStream, rollOverAppearanceStream, downAppearanceStream, appearanceState); } } + private PdfAction GetAction(DictionaryToken annotationDictionary) + { + // If this annotation returns a direct destination, turn it into a GoTo action. + if (DestinationProvider.TryGetDestination(annotationDictionary, + NameToken.Dest, + namedDestinations, + tokenScanner, + log, + false, + out var destination)) + { + return new GoToAction(destination); + } + + // Try get action from the dictionary. + if (ActionProvider.TryGetAction(annotationDictionary, namedDestinations, tokenScanner, log, out var action)) + { + return action; + } + + // No action or destination found, return null + return null; + } + private string GetNamedString(NameToken name, DictionaryToken dictionary) { string content = null; diff --git a/src/UglyToad.PdfPig/Annotations/AppearanceStream.cs b/src/UglyToad.PdfPig/Annotations/AppearanceStream.cs new file mode 100644 index 00000000..98350e3f --- /dev/null +++ b/src/UglyToad.PdfPig/Annotations/AppearanceStream.cs @@ -0,0 +1,66 @@ +namespace UglyToad.PdfPig.Annotations; + +using System; +using System.Collections.Generic; +using Tokens; + +/// +/// Appearance stream (PDF Reference 8.4.4) that describes what an annotation looks like. Each stream is a Form XObject. +/// The appearance stream is either stateless (in which case is true) +/// or stateful, in which case is false and the states can be retrieved via . +/// The states can then be used to retrieve the state-specific appearances using . +/// +public class AppearanceStream +{ + private readonly IDictionary appearanceStreamsByState; + + private readonly StreamToken statelessAppearanceStream; + + /// + /// Indicates if this appearance stream is stateless, or whether you can get appearances by state. + /// + public bool IsStateless => statelessAppearanceStream != null; + + /// + /// Get list of states. If this is a stateless appearance stream, an empty collection is returned. + /// + public ICollection GetStates => appearanceStreamsByState != null ? appearanceStreamsByState.Keys : new string[0]; + + /// + /// Constructor for stateless appearance stream + /// + /// + internal AppearanceStream(StreamToken streamToken) + { + statelessAppearanceStream = streamToken; + } + + /// + /// Constructor for stateful appearance stream + /// + /// + internal AppearanceStream(IDictionary appearanceStreamsByState) + { + this.appearanceStreamsByState = appearanceStreamsByState; + } + + /// + /// Get appearance stream for particular state + /// + /// + /// + /// + /// + public StreamToken Get(string state) + { + if (appearanceStreamsByState == null) + { + throw new Exception("Cannot get appearance by state when this is a stateless appearance stream"); + } + if (!appearanceStreamsByState.ContainsKey(state)) + { + throw new ArgumentOutOfRangeException(nameof(state), $"Appearance stream does not have state '{state}' (available states: {string.Join(",", appearanceStreamsByState.Keys)})"); + } + return appearanceStreamsByState[state]; + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Annotations/AppearanceStreamFactory.cs b/src/UglyToad.PdfPig/Annotations/AppearanceStreamFactory.cs new file mode 100644 index 00000000..f0196415 --- /dev/null +++ b/src/UglyToad.PdfPig/Annotations/AppearanceStreamFactory.cs @@ -0,0 +1,42 @@ +namespace UglyToad.PdfPig.Annotations; + +using System.Collections.Generic; +using Tokenization.Scanner; +using Tokens; + +internal static class AppearanceStreamFactory +{ + public static bool TryCreate(DictionaryToken appearanceDictionary, NameToken name, IPdfTokenScanner tokenScanner, out AppearanceStream appearanceStream) + { + if (appearanceDictionary.TryGet(name, out IndirectReferenceToken appearanceReference)) + { + var streamToken = tokenScanner.Get(appearanceReference.Data)?.Data as StreamToken; + appearanceStream = new AppearanceStream(streamToken); + return true; + } + + if (appearanceDictionary.TryGet(name, out DictionaryToken stateDictionary)) + { + var dict = new Dictionary(); + foreach (var state in stateDictionary.Data.Keys) + { + if (stateDictionary.Data.TryGetValue(state, out var stateRef) && + stateRef is IndirectReferenceToken appearanceRef) + { + var streamToken = tokenScanner.Get(appearanceRef.Data)?.Data as StreamToken; + dict[state] = streamToken; + } + + } + + if (dict.Count > 0) + { + appearanceStream = new AppearanceStream(dict); + return true; + } + } + + appearanceStream = null; + return false; + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Content/Catalog.cs b/src/UglyToad.PdfPig/Content/Catalog.cs index 9bbdd71f..4153f259 100644 --- a/src/UglyToad.PdfPig/Content/Catalog.cs +++ b/src/UglyToad.PdfPig/Content/Catalog.cs @@ -1,8 +1,7 @@ namespace UglyToad.PdfPig.Content { using System; - using System.Collections.Generic; - using Core; + using Outline; using Tokens; using Util.JetBrains.Annotations; @@ -12,90 +11,24 @@ /// public class Catalog { - private readonly IReadOnlyDictionary pagesByNumber; - /// /// The catalog dictionary containing assorted information. /// [NotNull] public DictionaryToken CatalogDictionary { get; } - /// - /// Defines the page tree node which is the root of the pages tree for the document. - /// - [NotNull] - public DictionaryToken PagesDictionary { get; } - - /// - /// The page tree for this document containing all pages, page numbers and their dictionaries. - /// - public PageTreeNode PageTree { get; } + internal NamedDestinations NamedDestinations { get; } - /// - /// Number of discovered pages. - /// - public int? NumberOfDiscoveredPages => pagesByNumber?.Count; + internal Pages Pages { get; } /// /// Create a new . /// - internal Catalog(DictionaryToken catalogDictionary, DictionaryToken pagesDictionary, - PageTreeNode pageTree) + internal Catalog(DictionaryToken catalogDictionary, Pages pages, NamedDestinations namedDestinations) { CatalogDictionary = catalogDictionary ?? throw new ArgumentNullException(nameof(catalogDictionary)); - PagesDictionary = pagesDictionary ?? throw new ArgumentNullException(nameof(pagesDictionary)); - PageTree = pageTree ?? throw new ArgumentNullException(nameof(pageTree)); - - if (!pageTree.IsRoot) - { - throw new ArgumentException("Page tree must be the root page tree node.", nameof(pageTree)); - } - - var byNumber = new Dictionary(); - PopulatePageByNumberDictionary(pageTree, byNumber); - pagesByNumber = byNumber; - } - - private static void PopulatePageByNumberDictionary(PageTreeNode node, Dictionary result) - { - if (node.IsPage) - { - if (!node.PageNumber.HasValue) - { - throw new InvalidOperationException($"Node was page but did not have page number: {node}."); - } - - result[node.PageNumber.Value] = node; - return; - } - - foreach (var child in node.Children) - { - PopulatePageByNumberDictionary(child, result); - } - } - - internal PageTreeNode GetPageNode(int pageNumber) - { - if (!pagesByNumber.TryGetValue(pageNumber, out var node)) - { - throw new InvalidOperationException($"Could not find page node by number for: {pageNumber}."); - } - - return node; - } - - internal PageTreeNode GetPageByReference(IndirectReference reference) - { - foreach (var page in pagesByNumber) - { - if (page.Value.Reference.Equals(reference)) - { - return page.Value; - } - } - - return null; + Pages = pages ?? throw new ArgumentNullException(nameof(pages)); + NamedDestinations = namedDestinations; } } } diff --git a/src/UglyToad.PdfPig/Content/IPageFactory.cs b/src/UglyToad.PdfPig/Content/IPageFactory.cs index 172a3e51..c0380cad 100644 --- a/src/UglyToad.PdfPig/Content/IPageFactory.cs +++ b/src/UglyToad.PdfPig/Content/IPageFactory.cs @@ -1,13 +1,14 @@ namespace UglyToad.PdfPig.Content { + using Outline; using Tokens; internal interface IPageFactory { - Page Create( - int number, + Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, + NamedDestinations annotationProvider, InternalParsingOptions parsingOptions); } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Content/Pages.cs b/src/UglyToad.PdfPig/Content/Pages.cs index 3bd55c09..1066a991 100644 --- a/src/UglyToad.PdfPig/Content/Pages.cs +++ b/src/UglyToad.PdfPig/Content/Pages.cs @@ -1,5 +1,8 @@ namespace UglyToad.PdfPig.Content { + using Annotations; + using Core; + using Outline; using System; using System.Collections.Generic; using Tokenization.Scanner; @@ -8,29 +11,25 @@ internal class Pages { - private readonly Catalog catalog; private readonly IPageFactory pageFactory; private readonly IPdfTokenScanner pdfScanner; + private readonly Dictionary pagesByNumber; + public int Count => pagesByNumber.Count; + + /// + /// The page tree for this document containing all pages, page numbers and their dictionaries. + /// + public PageTreeNode PageTree { get; } - public int Count { get; } - - internal Pages(Catalog catalog, IPageFactory pageFactory, IPdfTokenScanner pdfScanner) + internal Pages(IPageFactory pageFactory, IPdfTokenScanner pdfScanner, PageTreeNode pageTree, Dictionary pagesByNumber) { - this.catalog = catalog ?? throw new ArgumentNullException(nameof(catalog)); this.pageFactory = pageFactory ?? throw new ArgumentNullException(nameof(pageFactory)); this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner)); - - Count = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count); - var CountOfPagesByPagesTree = catalog.PageTree.Children.Count; - var numberOfDiscoveredPages = catalog.NumberOfDiscoveredPages; - if (numberOfDiscoveredPages is null == false && Count != numberOfDiscoveredPages) - { - //log.Warning($"Dictionary Page Count {Count} different to discovered pages {numberOfDiscoveredPages}. Using {numberOfDiscoveredPages}."); - Count = numberOfDiscoveredPages.Value; - } + this.pagesByNumber = pagesByNumber; + PageTree = pageTree; } - - public Page GetPage(int pageNumber, InternalParsingOptions parsingOptions) + + internal Page GetPage(int pageNumber, NamedDestinations namedDestinations, InternalParsingOptions parsingOptions) { if (pageNumber <= 0 || pageNumber > Count) { @@ -40,7 +39,7 @@ $"Page number {pageNumber} invalid, must be between 1 and {Count}."); } - var pageNode = catalog.GetPageNode(pageNumber); + var pageNode = GetPageNode(pageNumber); var pageStack = new Stack(); var currentNode = pageNode; @@ -71,14 +70,37 @@ pageTreeMembers.Rotation = rotateToken.Int; } } - var page = pageFactory.Create( pageNumber, pageNode.NodeDictionary, pageTreeMembers, + namedDestinations, parsingOptions); return page; } + + internal PageTreeNode GetPageNode(int pageNumber) + { + if (!pagesByNumber.TryGetValue(pageNumber, out var node)) + { + throw new InvalidOperationException($"Could not find page node by number for: {pageNumber}."); + } + + return node; + } + + internal PageTreeNode GetPageByReference(IndirectReference reference) + { + foreach (var page in pagesByNumber) + { + if (page.Value.Reference.Equals(reference)) + { + return page.Value; + } + } + + return null; + } } } diff --git a/src/UglyToad.PdfPig/Content/PagesFactory.cs b/src/UglyToad.PdfPig/Content/PagesFactory.cs new file mode 100644 index 00000000..2f2c244c --- /dev/null +++ b/src/UglyToad.PdfPig/Content/PagesFactory.cs @@ -0,0 +1,241 @@ +namespace UglyToad.PdfPig.Content; + +using Core; +using Logging; +using Parser.Parts; +using System; +using System.Collections.Generic; +using System.Linq; +using Tokenization.Scanner; +using Tokens; +using Util; + +internal class PagesFactory +{ + private class PageCounter + { + public int PageCount { get; private set; } + public void Increment() + { + PageCount++; + } + } + + public static Pages Create(IndirectReference pagesReference, DictionaryToken pagesDictionary, IPdfTokenScanner scanner, IPageFactory pageFactory, ILog log, bool isLenientParsing) + { + var pageNumber = new PageCounter(); + + var pageTree = ProcessPagesNode(pagesReference, pagesDictionary, new IndirectReference(1, 0), true, + scanner, isLenientParsing, pageNumber); + + if (!pageTree.IsRoot) + { + throw new ArgumentException("Page tree must be the root page tree node.", nameof(pageTree)); + } + + var pagesByNumber = new Dictionary(); + PopulatePageByNumberDictionary(pageTree, pagesByNumber); + + var dictionaryPageCount = pagesDictionary.GetIntOrDefault(NameToken.Count); + if (dictionaryPageCount != pagesByNumber.Count) + { + log.Warn($"Dictionary Page Count {dictionaryPageCount} different to discovered pages {pagesByNumber.Count}. Using {pagesByNumber.Count}."); + } + + return new Pages(pageFactory, scanner, pageTree, pagesByNumber); + } + + + private static PageTreeNode ProcessPagesNode(IndirectReference referenceInput, + DictionaryToken nodeDictionaryInput, + IndirectReference parentReferenceInput, + bool isRoot, + IPdfTokenScanner pdfTokenScanner, + bool isLenientParsing, + PageCounter pageNumber) + { + bool isPage = CheckIfIsPage(nodeDictionaryInput, parentReferenceInput, isRoot, pdfTokenScanner, isLenientParsing); + + if (isPage) + { + pageNumber.Increment(); + + return new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray.Instance); + } + + + + //If we got here, we have to iterate till we manage to exit + + // Attempt to detect (and break) any infinite loop (IL) by recording the ids of the last 1000 (by default) tokens processed. + const int InfiniteLoopWorkingWindow = 1000; + var visitedTokens = new Dictionary>(); // Quick lookup containing ids (object number, generation) of tokens already processed (trimmed as we go to last 1000 (by default)) + var visitedTokensWorkingWindow = new Queue<(long ObjectNumber, int Generation)>(InfiniteLoopWorkingWindow); + + var toProcess = + new Queue<(PageTreeNode thisPage, IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference, + List nodeChildren)>(); + var firstPage = new PageTreeNode(nodeDictionaryInput, referenceInput, false, null); + var setChildren = new List(); + var firstPageChildren = new List(); + + setChildren.Add(() => firstPage.WithChildren(firstPageChildren)); + + toProcess.Enqueue( + (thisPage: firstPage, reference: referenceInput, nodeDictionary: nodeDictionaryInput, parentReference: parentReferenceInput, + nodeChildren: firstPageChildren)); + + do + { + var current = toProcess.Dequeue(); + + #region Break any potential infinite loop + // Remember the last 1000 (by default) tokens and if we attempt to process again break out of loop + var currentReferenceObjectNumber = current.reference.ObjectNumber; + var currentReferenceGeneration = current.reference.Generation; + if (visitedTokens.ContainsKey(currentReferenceObjectNumber)) + { + var generations = visitedTokens[currentReferenceObjectNumber]; + + if (generations.Contains(currentReferenceGeneration)) + { + var listOfLastVisitedToken = visitedTokensWorkingWindow.ToList(); + var indexOfCurrentTokenInListOfLastVisitedToken = listOfLastVisitedToken.IndexOf((currentReferenceObjectNumber, currentReferenceGeneration)); + var howManyTokensBack = Math.Abs(indexOfCurrentTokenInListOfLastVisitedToken - listOfLastVisitedToken.Count); //eg initate loop is taking us back to last token or five token back + System.Diagnostics.Debug.WriteLine($"Break infinite loop while processing page {pageNumber.PageCount+1} tokens. Token with object number {currentReferenceObjectNumber} and generation {currentReferenceGeneration} processed {howManyTokensBack} token(s) back. "); + continue; // don't reprocess token already processed. break infinite loop. Issue #519 + } + else + { + generations.Add(currentReferenceGeneration); + visitedTokens[currentReferenceObjectNumber] = generations; + } + } + else + { + visitedTokens.Add(currentReferenceObjectNumber, new HashSet() { currentReferenceGeneration }); + + visitedTokensWorkingWindow.Enqueue((currentReferenceObjectNumber, currentReferenceGeneration)); + if (visitedTokensWorkingWindow.Count >= InfiniteLoopWorkingWindow) + { + var toBeRemovedFromWorkingHashset = visitedTokensWorkingWindow.Dequeue(); + var toBeRemovedObjectNumber = toBeRemovedFromWorkingHashset.ObjectNumber; + var toBeRemovedGeneration = toBeRemovedFromWorkingHashset.Generation; + var generations = visitedTokens[toBeRemovedObjectNumber]; + generations.Remove(toBeRemovedGeneration); + if (generations.Count == 0) + { + visitedTokens.Remove(toBeRemovedObjectNumber); + } + else + { + visitedTokens[toBeRemovedObjectNumber] = generations; + } + } + } + #endregion + if (!current.nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids)) + { + if (!isLenientParsing) + { + throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {current.nodeDictionary}."); + } + + kids = new ArrayToken(EmptyArray.Instance); + } + + foreach (var kid in kids.Data) + { + if (!(kid is IndirectReferenceToken kidRef)) + { + throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}."); + } + + if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken)) + { + throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}."); + } + + bool isChildPage = CheckIfIsPage(kidDictionaryToken, current.reference, false, pdfTokenScanner, isLenientParsing); + + if (isChildPage) + { + var kidPageNode = + new PageTreeNode(kidDictionaryToken, kidRef.Data, true, pageNumber.PageCount).WithChildren(EmptyArray.Instance); + current.nodeChildren.Add(kidPageNode); + } + else + { + var kidChildNode = new PageTreeNode(kidDictionaryToken, kidRef.Data, false, null); + var kidChildren = new List(); + toProcess.Enqueue( + (thisPage: kidChildNode, reference: kidRef.Data, nodeDictionary: kidDictionaryToken, parentReference: current.reference, + nodeChildren: kidChildren)); + + setChildren.Add(() => kidChildNode.WithChildren(kidChildren)); + + current.nodeChildren.Add(kidChildNode); + } + } + } while (toProcess.Count > 0); + + foreach (var action in setChildren) + { + action(); + } + + foreach (var child in firstPage.Children.ToRecursiveOrderList(x=>x.Children).Where(child => child.IsPage)) + { + pageNumber.Increment(); + child.PageNumber = pageNumber.PageCount; + } + + return firstPage; + } + + private static bool CheckIfIsPage(DictionaryToken nodeDictionary, IndirectReference parentReference, bool isRoot, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing) + { + var isPage = false; + + if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type)) + { + if (!isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}."); } + + if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _)) { isPage = true; } + } + else + { + isPage = type.Equals(NameToken.Page); + + if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}."); } + } + + if (!isLenientParsing && !isRoot) + { + if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken)) { throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}."); } + + if (!parentReferenceToken.Data.Equals(parentReference)) { throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}."); } + } + + return isPage; + } + + private static void PopulatePageByNumberDictionary(PageTreeNode node, Dictionary result) + { + if (node.IsPage) + { + if (!node.PageNumber.HasValue) + { + throw new InvalidOperationException($"Node was page but did not have page number: {node}."); + } + + result[node.PageNumber.Value] = node; + return; + } + + foreach (var child in node.Children) + { + PopulatePageByNumberDictionary(child, result); + } + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Outline/BookmarkNode.cs b/src/UglyToad.PdfPig/Outline/BookmarkNode.cs index 53753b82..8e507fef 100644 --- a/src/UglyToad.PdfPig/Outline/BookmarkNode.cs +++ b/src/UglyToad.PdfPig/Outline/BookmarkNode.cs @@ -1,5 +1,6 @@ namespace UglyToad.PdfPig.Outline { + using Destinations; using System; using System.Collections.Generic; diff --git a/src/UglyToad.PdfPig/Outline/BookmarksProvider.cs b/src/UglyToad.PdfPig/Outline/BookmarksProvider.cs index a1ede513..fc0d611e 100644 --- a/src/UglyToad.PdfPig/Outline/BookmarksProvider.cs +++ b/src/UglyToad.PdfPig/Outline/BookmarksProvider.cs @@ -1,5 +1,6 @@ namespace UglyToad.PdfPig.Outline { + using Actions; using Content; using Destinations; using Logging; @@ -41,14 +42,12 @@ return null; } - var namedDestinations = ReadNamedDestinations(catalog, pdfScanner, log); - var roots = new List(); var seen = new HashSet(); while (next != null) { - ReadBookmarksRecursively(next, 0, false, seen, namedDestinations, catalog, roots); + ReadBookmarksRecursively(next, 0, false, seen, catalog.NamedDestinations, roots); if (!next.TryGet(NameToken.Next, out IndirectReferenceToken nextReference) || !seen.Add(nextReference.Data)) @@ -66,8 +65,7 @@ /// Extract bookmarks recursively. /// private void ReadBookmarksRecursively(DictionaryToken nodeDictionary, int level, bool readSiblings, HashSet seen, - IReadOnlyDictionary namedDestinations, - Catalog catalog, + NamedDestinations namedDestinations, List list) { // 12.3 Document-Level Navigation @@ -82,37 +80,37 @@ var children = new List(); if (nodeDictionary.TryGet(NameToken.First, pdfScanner, out DictionaryToken firstChild)) { - ReadBookmarksRecursively(firstChild, level + 1, true, seen, namedDestinations, catalog, children); + ReadBookmarksRecursively(firstChild, level + 1, true, seen, namedDestinations, children); } BookmarkNode bookmark; - if (nodeDictionary.TryGet(NameToken.Dest, pdfScanner, out ArrayToken destArray) - && TryGetExplicitDestination(destArray, catalog, log, out var destination)) + if (DestinationProvider.TryGetDestination(nodeDictionary, NameToken.Dest, namedDestinations, pdfScanner, log, false, out var destination)) { bookmark = new DocumentBookmarkNode(title, level, destination, children); } - else if (nodeDictionary.TryGet(NameToken.Dest, pdfScanner, out IDataToken destStringToken)) + else if (ActionProvider.TryGetAction(nodeDictionary, namedDestinations, pdfScanner, log, out var actionResult)) { - // 12.3.2.3 Named Destinations - if (namedDestinations.TryGetValue(destStringToken.Data, out destination)) + if (actionResult is GoToRAction goToRAction) { - bookmark = new DocumentBookmarkNode(title, level, destination, children); + bookmark = new ExternalBookmarkNode(title, level, goToRAction.Destination, children, goToRAction.Filename); + } + else if (actionResult is GoToAction goToAction) + { + bookmark = new DocumentBookmarkNode(title, level, goToAction.Destination, children); + } + else if (actionResult is UriAction uriAction) + { + bookmark = new UriBookmarkNode(title, level, uriAction.Uri, children); } else { return; } } - else if (nodeDictionary.TryGet(NameToken.A, pdfScanner, out DictionaryToken actionDictionary) - && TryGetAction(actionDictionary, catalog, pdfScanner, namedDestinations, log, title, level, children, out var actionResult)) - { - bookmark = actionResult; - } else { log.Error($"No /Dest(ination) or /A(ction) entry found for bookmark node: {nodeDictionary}."); - return; } @@ -140,267 +138,8 @@ break; } - ReadBookmarksRecursively(current, level, false, seen, namedDestinations, catalog, list); + ReadBookmarksRecursively(current, level, false, seen, namedDestinations, list); } } - - #region Named Destinations - private static IReadOnlyDictionary ReadNamedDestinations(Catalog catalog, IPdfTokenScanner pdfScanner, - ILog log) - { - var result = new Dictionary(); - - if (catalog.CatalogDictionary.TryGet(NameToken.Dests, pdfScanner, out DictionaryToken dests)) - { - /* - * In PDF 1.1, the correspondence between name objects and destinations is defined by the /Dests entry in the document catalog. - * The value of this entry is a dictionary in which each key is a destination name and the corresponding value is either an array - * defining the destination, using the explicit destination syntax, or a dictionary with a /D entry whose value is such an array. - */ - foreach (var kvp in dests.Data) - { - var value = kvp.Value; - - if (TryReadExplicitDestination(value, catalog, pdfScanner, log, out var destination)) - { - result[kvp.Key] = destination; - } - } - } - else if (catalog.CatalogDictionary.TryGet(NameToken.Names, pdfScanner, out DictionaryToken names) - && names.TryGet(NameToken.Dests, pdfScanner, out dests)) - { - /* - * In PDF 1.2, the correspondence between strings and destinations is defined by the /Dests entry in the document's name dictionary. - * The value of the /Dests entry is a name tree mapping name strings to destinations. - * The keys in the name tree may be treated as text strings for display purposes. - * The destination value associated with a key in the name tree may be either an array or a dictionary. - */ - NameTreeParser.FlattenNameTree(dests, pdfScanner, value => - { - if (TryReadExplicitDestination(value, catalog, pdfScanner, log, out var destination)) - { - return destination; - } - - return null; - }, result); - } - - return result; - } - - private static bool TryReadExplicitDestination(IToken value, Catalog catalog, IPdfTokenScanner pdfScanner, - ILog log, out ExplicitDestination destination) - { - destination = null; - - if (DirectObjectFinder.TryGet(value, pdfScanner, out ArrayToken valueArray) - && TryGetExplicitDestination(valueArray, catalog, log, out destination)) - { - return true; - } - - if (DirectObjectFinder.TryGet(value, pdfScanner, out DictionaryToken valueDictionary) - && valueDictionary.TryGet(NameToken.D, pdfScanner, out valueArray) - && TryGetExplicitDestination(valueArray, catalog, log, out destination)) - { - return true; - } - - return false; - } - - private static bool TryGetExplicitDestination(ArrayToken explicitDestinationArray, Catalog catalog, - ILog log, - out ExplicitDestination destination) - { - destination = null; - - if (explicitDestinationArray == null || explicitDestinationArray.Length == 0) - { - return false; - } - - int pageNumber; - - var pageToken = explicitDestinationArray[0]; - - if (pageToken is IndirectReferenceToken pageIndirectReferenceToken) - { - var page = catalog.GetPageByReference(pageIndirectReferenceToken.Data); - - if (page?.PageNumber == null) - { - return false; - } - - pageNumber = page.PageNumber.Value; - } - else if (pageToken is NumericToken pageNumericToken) - { - pageNumber = pageNumericToken.Int + 1; - } - else - { - var errorMessage = $"{nameof(TryGetExplicitDestination)} No page number given in 'Dest': '{explicitDestinationArray}'."; - - log.Error(errorMessage); - - return false; - } - - var destTypeToken = explicitDestinationArray[1] as NameToken; - if (destTypeToken == null) - { - var errorMessage = $"Missing name token as second argument to explicit destination: {explicitDestinationArray}."; - - log.Error(errorMessage); - - destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitPage, ExplicitDestinationCoordinates.Empty); - - return true; - } - - if (destTypeToken.Equals(NameToken.XYZ)) - { - // [page /XYZ left top zoom] - var left = explicitDestinationArray[2] as NumericToken; - var top = explicitDestinationArray[3] as NumericToken; - - destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.XyzCoordinates, - new ExplicitDestinationCoordinates(left?.Data, top?.Data)); - - return true; - } - - if (destTypeToken.Equals(NameToken.Fit)) - { - // [page /Fit] - destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitPage, - ExplicitDestinationCoordinates.Empty); - - return true; - } - - if (destTypeToken.Equals(NameToken.FitH)) - { - // [page /FitH top] - var top = explicitDestinationArray[2] as NumericToken; - destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitHorizontally, - new ExplicitDestinationCoordinates(null, top?.Data)); - - return true; - } - - if (destTypeToken.Equals(NameToken.FitV)) - { - // [page /FitV left] - var left = explicitDestinationArray[2] as NumericToken; - destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitVertically, - new ExplicitDestinationCoordinates(left?.Data)); - - return true; - } - - if (destTypeToken.Equals(NameToken.FitR)) - { - // [page /FitR left bottom right top] - var left = explicitDestinationArray[2] as NumericToken; - var bottom = explicitDestinationArray[3] as NumericToken; - var right = explicitDestinationArray[4] as NumericToken; - var top = explicitDestinationArray[5] as NumericToken; - - destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitRectangle, - new ExplicitDestinationCoordinates(left?.Data, top?.Data, right?.Data, bottom?.Data)); - - return true; - } - - if (destTypeToken.Equals(NameToken.FitB)) - { - // [page /FitB] - destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitBoundingBox, - ExplicitDestinationCoordinates.Empty); - - return true; - } - - if (destTypeToken.Equals(NameToken.FitBH)) - { - // [page /FitBH top] - destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitBoundingBoxHorizontally, - new ExplicitDestinationCoordinates(null, (explicitDestinationArray[2] as NumericToken)?.Data)); - - return true; - } - - if (destTypeToken.Equals(NameToken.FitBV)) - { - // [page /FitBV left] - destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitBoundingBoxVertically, - new ExplicitDestinationCoordinates((explicitDestinationArray[2] as NumericToken)?.Data)); - - return true; - } - - return false; - } - #endregion - - private static bool TryGetAction(DictionaryToken actionDictionary, Catalog catalog, IPdfTokenScanner pdfScanner, - IReadOnlyDictionary namedDestinations, - ILog log, string title, int level, List children, out BookmarkNode result) - { - result = null; - - if (!actionDictionary.TryGet(NameToken.S, pdfScanner, out NameToken actionType)) - { - throw new PdfDocumentFormatException($"No action type (/S) specified for action: {actionDictionary}."); - } - - if (actionType.Equals(NameToken.GoTo)) - { - if (actionDictionary.TryGet(NameToken.D, pdfScanner, out ArrayToken destinationArray) - && TryGetExplicitDestination(destinationArray, catalog, log, out var destination)) - { - result = new DocumentBookmarkNode(title, level, destination, children); - - return true; - } - - if (actionDictionary.TryGet(NameToken.D, pdfScanner, out IDataToken destinationName) - && namedDestinations.TryGetValue(destinationName.Data, out destination)) - { - result = new DocumentBookmarkNode(title, level, destination, children); - - return true; - } - } - else if (actionType.Equals(NameToken.GoToR)) - { - if (actionDictionary.TryGetOptionalStringDirect(NameToken.F, pdfScanner, out var filename)) - { - result = new ExternalBookmarkNode(title, level, filename, children); - return true; - } - - result = new ExternalBookmarkNode(title, level, string.Empty, children); - return true; - } - else if (actionType.Equals(NameToken.Uri)) - { - if (actionDictionary.TryGetOptionalStringDirect(NameToken.Uri, pdfScanner, out var uri)) - { - result = new UriBookmarkNode(title, level, uri, children); - return true; - } - - result = new UriBookmarkNode(title, level, string.Empty, children); - return true; - } - - return false; - } } } diff --git a/src/UglyToad.PdfPig/Outline/Destinations/DestinationProvider.cs b/src/UglyToad.PdfPig/Outline/Destinations/DestinationProvider.cs new file mode 100644 index 00000000..52f5d97c --- /dev/null +++ b/src/UglyToad.PdfPig/Outline/Destinations/DestinationProvider.cs @@ -0,0 +1,36 @@ +namespace UglyToad.PdfPig.Outline.Destinations +{ + using Logging; + using Tokenization.Scanner; + using Tokens; + + internal static class DestinationProvider + { + /// + /// Get explicit destination or a named destination (Ref 12.3.2.3) from dictionary + /// + /// + /// Token name, can be D or Dest + /// + /// + /// + /// in case we are looking up a destination for a GoToR (Go To Remote) action: pass in true + /// to enforce a check for indirect page references (which is not allowed for GoToR) + /// + /// + internal static bool TryGetDestination(DictionaryToken dictionary, NameToken destinationToken, NamedDestinations namedDestinations, IPdfTokenScanner pdfScanner, ILog log, bool isRemoteDestination, out ExplicitDestination destination) + { + if (dictionary.TryGet(destinationToken, pdfScanner, out ArrayToken destArray)) + { + return namedDestinations.TryGetExplicitDestination(destArray, log, isRemoteDestination, out destination); + } + if (dictionary.TryGet(destinationToken, pdfScanner, out IDataToken destStringToken)) + { + return namedDestinations.TryGet(destStringToken.Data, out destination); + } + destination = null; + return false; + } + + } +} diff --git a/src/UglyToad.PdfPig/Outline/Destinations/ExplicitDestination.cs b/src/UglyToad.PdfPig/Outline/Destinations/ExplicitDestination.cs index 854af717..d93ac31b 100644 --- a/src/UglyToad.PdfPig/Outline/Destinations/ExplicitDestination.cs +++ b/src/UglyToad.PdfPig/Outline/Destinations/ExplicitDestination.cs @@ -6,7 +6,7 @@ public class ExplicitDestination { /// - /// The page number of the destination. + /// The page number (1-based) of the destination. /// public int PageNumber { get; } diff --git a/src/UglyToad.PdfPig/Outline/Destinations/NamedDestinations.cs b/src/UglyToad.PdfPig/Outline/Destinations/NamedDestinations.cs new file mode 100644 index 00000000..45a8dc5e --- /dev/null +++ b/src/UglyToad.PdfPig/Outline/Destinations/NamedDestinations.cs @@ -0,0 +1,44 @@ +namespace UglyToad.PdfPig.Outline; + +using Content; +using Destinations; +using Logging; +using System.Collections.Generic; +using Tokens; + +/// +/// Named destinations in a PDF document +/// +internal class NamedDestinations +{ + /// + /// Dictionary containing explicit destinations, keyed by name + /// + private readonly IReadOnlyDictionary namedDestinations; + + /// + /// Pages are required for getting explicit destinations + /// + private readonly Pages pages; + + /// + /// Constructor + /// + /// + /// + internal NamedDestinations(IReadOnlyDictionary namedDestinations, Pages pages) + { + this.namedDestinations = namedDestinations; + this.pages = pages; + } + + internal bool TryGet(string name, out ExplicitDestination destination) + { + return namedDestinations.TryGetValue(name, out destination); + } + + internal bool TryGetExplicitDestination(ArrayToken explicitDestinationArray, ILog log, bool isRemoteDestination, out ExplicitDestination destination) + { + return NamedDestinationsProvider.TryGetExplicitDestination(explicitDestinationArray, pages, log, isRemoteDestination, out destination); + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Outline/Destinations/NamedDestinationsProvider.cs b/src/UglyToad.PdfPig/Outline/Destinations/NamedDestinationsProvider.cs new file mode 100644 index 00000000..bfef9dae --- /dev/null +++ b/src/UglyToad.PdfPig/Outline/Destinations/NamedDestinationsProvider.cs @@ -0,0 +1,220 @@ +namespace UglyToad.PdfPig.Outline; + +using Content; +using Destinations; +using Logging; +using Parser.Parts; +using System.Collections.Generic; +using Tokenization.Scanner; +using Tokens; + +internal static class NamedDestinationsProvider +{ + internal static NamedDestinations Read(DictionaryToken catalogDictionary, IPdfTokenScanner pdfScanner, Pages pages, ILog log) + { + var destinationsByName = new Dictionary(); + + if (catalogDictionary.TryGet(NameToken.Dests, pdfScanner, out DictionaryToken destinations)) + { + /* + * In PDF 1.1, the correspondence between name objects and destinations is defined by the /Dests entry in the document catalog. + * The value of this entry is a dictionary in which each key is a destination name and the corresponding value is either an array + * defining the destination, using the explicit destination syntax, or a dictionary with a /D entry whose value is such an array. + */ + foreach (var kvp in destinations.Data) + { + var value = kvp.Value; + + if (TryReadExplicitDestination(value, pdfScanner, pages, log, false, out var destination)) + { + destinationsByName[kvp.Key] = destination; + } + } + } + else if (catalogDictionary.TryGet(NameToken.Names, pdfScanner, out DictionaryToken names) + && names.TryGet(NameToken.Dests, pdfScanner, out destinations)) + { + /* + * In PDF 1.2, the correspondence between strings and destinations is defined by the /Dests entry in the document's name dictionary. + * The value of the /Dests entry is a name tree mapping name strings to destinations. + * The keys in the name tree may be treated as text strings for display purposes. + * The destination value associated with a key in the name tree may be either an array or a dictionary. + */ + NameTreeParser.FlattenNameTree(destinations, pdfScanner, value => + { + if (TryReadExplicitDestination(value, pdfScanner, pages, log, false, out var destination)) + { + return destination; + } + + return null; + }, destinationsByName); + } + + return new NamedDestinations(destinationsByName, pages); + } + + private static bool TryReadExplicitDestination(IToken value, IPdfTokenScanner pdfScanner, Pages pages, ILog log, bool isRemoteDestination, out ExplicitDestination destination) + { + destination = null; + + if (DirectObjectFinder.TryGet(value, pdfScanner, out ArrayToken valueArray) + && TryGetExplicitDestination(valueArray, pages, log, isRemoteDestination, out destination)) + { + return true; + } + + if (DirectObjectFinder.TryGet(value, pdfScanner, out DictionaryToken valueDictionary) + && valueDictionary.TryGet(NameToken.D, pdfScanner, out valueArray) + && TryGetExplicitDestination(valueArray, pages, log, isRemoteDestination, out destination)) + { + return true; + } + + return false; + } + + internal static bool TryGetExplicitDestination(ArrayToken explicitDestinationArray, Pages pages, ILog log, bool isRemoteDestination, out ExplicitDestination destination) + { + destination = null; + + if (explicitDestinationArray == null || explicitDestinationArray.Length == 0) + { + return false; + } + + int pageNumber; + + var pageToken = explicitDestinationArray[0]; + + if (pageToken is IndirectReferenceToken pageIndirectReferenceToken) + { + if (isRemoteDestination) + { + // Table 8.50 Remote Go-To Actions + var errorMessage = $"{nameof(TryGetExplicitDestination)} Cannot use indirect reference for remote destination."; + log?.Error(errorMessage); + return false; + } + var page = pages.GetPageByReference(pageIndirectReferenceToken.Data); + if (page?.PageNumber == null) + { + return false; + } + + pageNumber = page.PageNumber.Value; + } + else if (pageToken is NumericToken pageNumericToken) + { + pageNumber = pageNumericToken.Int + 1; + } + else + { + var errorMessage = $"{nameof(TryGetExplicitDestination)} No page number given in 'Dest': '{explicitDestinationArray}'."; + + log?.Error(errorMessage); + + return false; + } + + NameToken destTypeToken = null; + if (explicitDestinationArray.Length > 1) + { + destTypeToken = explicitDestinationArray[1] as NameToken; + } + if (destTypeToken == null) + { + var errorMessage = $"Missing name token as second argument to explicit destination: {explicitDestinationArray}."; + + log?.Error(errorMessage); + + destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitPage, ExplicitDestinationCoordinates.Empty); + + return true; + } + + if (destTypeToken.Equals(NameToken.XYZ)) + { + // [page /XYZ left top zoom] + var left = explicitDestinationArray[2] as NumericToken; + var top = explicitDestinationArray[3] as NumericToken; + + destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.XyzCoordinates, + new ExplicitDestinationCoordinates(left?.Data, top?.Data)); + + return true; + } + + if (destTypeToken.Equals(NameToken.Fit)) + { + // [page /Fit] + destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitPage, + ExplicitDestinationCoordinates.Empty); + + return true; + } + + if (destTypeToken.Equals(NameToken.FitH)) + { + // [page /FitH top] + var top = explicitDestinationArray[2] as NumericToken; + destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitHorizontally, + new ExplicitDestinationCoordinates(null, top?.Data)); + + return true; + } + + if (destTypeToken.Equals(NameToken.FitV)) + { + // [page /FitV left] + var left = explicitDestinationArray[2] as NumericToken; + destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitVertically, + new ExplicitDestinationCoordinates(left?.Data)); + + return true; + } + + if (destTypeToken.Equals(NameToken.FitR)) + { + // [page /FitR left bottom right top] + var left = explicitDestinationArray[2] as NumericToken; + var bottom = explicitDestinationArray[3] as NumericToken; + var right = explicitDestinationArray[4] as NumericToken; + var top = explicitDestinationArray[5] as NumericToken; + + destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitRectangle, + new ExplicitDestinationCoordinates(left?.Data, top?.Data, right?.Data, bottom?.Data)); + + return true; + } + + if (destTypeToken.Equals(NameToken.FitB)) + { + // [page /FitB] + destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitBoundingBox, + ExplicitDestinationCoordinates.Empty); + + return true; + } + + if (destTypeToken.Equals(NameToken.FitBH)) + { + // [page /FitBH top] + destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitBoundingBoxHorizontally, + new ExplicitDestinationCoordinates(null, (explicitDestinationArray[2] as NumericToken)?.Data)); + + return true; + } + + if (destTypeToken.Equals(NameToken.FitBV)) + { + // [page /FitBV left] + destination = new ExplicitDestination(pageNumber, ExplicitDestinationType.FitBoundingBoxVertically, + new ExplicitDestinationCoordinates((explicitDestinationArray[2] as NumericToken)?.Data)); + + return true; + } + + return false; + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Outline/DocumentBookmarkNode.cs b/src/UglyToad.PdfPig/Outline/DocumentBookmarkNode.cs index bb57db6f..7cb9e6a2 100644 --- a/src/UglyToad.PdfPig/Outline/DocumentBookmarkNode.cs +++ b/src/UglyToad.PdfPig/Outline/DocumentBookmarkNode.cs @@ -1,6 +1,5 @@ namespace UglyToad.PdfPig.Outline { - using System; using System.Collections.Generic; using Destinations; @@ -14,7 +13,7 @@ /// /// The page number where the bookmark is located. /// - public int PageNumber { get; } + public int PageNumber => Destination.PageNumber; /// /// The destination of the bookmark in the current document. @@ -28,8 +27,7 @@ public DocumentBookmarkNode(string title, int level, ExplicitDestination destination, IReadOnlyList children) : base(title, level, children) { - Destination = destination ?? throw new ArgumentNullException(nameof(destination)); - PageNumber = destination.PageNumber; + Destination = destination; } /// diff --git a/src/UglyToad.PdfPig/Outline/EmbeddedBookmarkNode.cs b/src/UglyToad.PdfPig/Outline/EmbeddedBookmarkNode.cs new file mode 100644 index 00000000..06ee2d49 --- /dev/null +++ b/src/UglyToad.PdfPig/Outline/EmbeddedBookmarkNode.cs @@ -0,0 +1,33 @@ +namespace UglyToad.PdfPig.Outline; + +using Destinations; +using System; +using System.Collections.Generic; + +/// +/// +/// A node in the of a PDF document which corresponds +/// to a location in an embedded file. +/// +public class EmbeddedBookmarkNode : DocumentBookmarkNode +{ + /// + /// The file specification for the embedded file + /// + public string FileSpecification { get; } + + /// + /// + /// Create a new . + /// + public EmbeddedBookmarkNode(string title, int level, ExplicitDestination destination, IReadOnlyList children, string fileSpecification) : base(title, level, destination, children) + { + FileSpecification = fileSpecification ?? throw new ArgumentNullException(nameof(fileSpecification)); + } + + /// + public override string ToString() + { + return $"Embedded file '{FileSpecification}', {Level}, {Title}"; + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Outline/ExternalBookmarkNode.cs b/src/UglyToad.PdfPig/Outline/ExternalBookmarkNode.cs index e7c1b38e..3e98c2d5 100644 --- a/src/UglyToad.PdfPig/Outline/ExternalBookmarkNode.cs +++ b/src/UglyToad.PdfPig/Outline/ExternalBookmarkNode.cs @@ -1,5 +1,6 @@ namespace UglyToad.PdfPig.Outline { + using Destinations; using System; using System.Collections.Generic; @@ -8,7 +9,7 @@ /// A node in the of a PDF document which corresponds /// to a location in an external file. /// - public class ExternalBookmarkNode : BookmarkNode + public class ExternalBookmarkNode : DocumentBookmarkNode { /// /// The name of the file containing this bookmark. @@ -19,7 +20,7 @@ /// /// Create a new . /// - public ExternalBookmarkNode(string title, int level, string fileName, IReadOnlyList children) : base(title, level, children) + public ExternalBookmarkNode(string title, int level, ExplicitDestination destination, IReadOnlyList children, string fileName) : base(title, level, destination, children) { FileName = fileName ?? throw new ArgumentNullException(nameof(fileName)); } diff --git a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs index 88e69586..83c88449 100644 --- a/src/UglyToad.PdfPig/Parser/CatalogFactory.cs +++ b/src/UglyToad.PdfPig/Parser/CatalogFactory.cs @@ -1,30 +1,18 @@ namespace UglyToad.PdfPig.Parser { using System; - using System.Collections.Generic; using Content; using Core; + using Logging; + using Outline; using Parts; - using System.Linq; using Tokenization.Scanner; using Tokens; - using Util; internal static class CatalogFactory { - - private class PageCounter - { - public int PageCount { get; private set; } - public void Increment() - { - PageCount++; - } - } - public static Catalog Create(IndirectReference rootReference, DictionaryToken dictionary, - IPdfTokenScanner scanner, - bool isLenientParsing) + IPdfTokenScanner scanner, PageFactory pageFactory, ILog log, bool isLenientParsing) { if (dictionary == null) { @@ -41,203 +29,27 @@ throw new PdfDocumentFormatException($"No pages entry was found in the catalog dictionary: {dictionary}."); } - DictionaryToken pages; + DictionaryToken pagesDictionary; var pagesReference = rootReference; if (value is IndirectReferenceToken pagesRef) { pagesReference = pagesRef.Data; - pages = DirectObjectFinder.Get(pagesRef, scanner); + pagesDictionary = DirectObjectFinder.Get(pagesRef, scanner); } else if (value is DictionaryToken pagesDict) { - pages = pagesDict; + pagesDictionary = pagesDict; } else { - pages = DirectObjectFinder.Get(value, scanner); + pagesDictionary = DirectObjectFinder.Get(value, scanner); } - var pageNumber = new PageCounter(); + var pages = PagesFactory.Create(pagesReference, pagesDictionary, scanner, pageFactory, log, isLenientParsing); + var namedDestinations = NamedDestinationsProvider.Read(dictionary, scanner, pages, null); - var pageTree = ProcessPagesNode(pagesReference, pages, new IndirectReference(1, 0), true, - scanner, isLenientParsing, pageNumber); - - return new Catalog(dictionary, pages, pageTree); - } - - private static PageTreeNode ProcessPagesNode(IndirectReference referenceInput, - DictionaryToken nodeDictionaryInput, - IndirectReference parentReferenceInput, - bool isRoot, - IPdfTokenScanner pdfTokenScanner, - bool isLenientParsing, - PageCounter pageNumber) - { - bool isPage = CheckIfIsPage(nodeDictionaryInput, parentReferenceInput, isRoot, pdfTokenScanner, isLenientParsing); - - if (isPage) - { - pageNumber.Increment(); - - return new PageTreeNode(nodeDictionaryInput, referenceInput, true, pageNumber.PageCount).WithChildren(EmptyArray.Instance); - } - - - - //If we got here, we have to iterate till we manage to exit - - // Attempt to detect (and break) any infitine loop (IL) by recording the ids of the last 1000 (by default) tokens processed. - const int InfiniteLoopWorkingWindow = 1000; - var visitedTokens = new Dictionary>(); // Quick lookup containing ids (object number, generation) of tokens already processed (trimmed as we go to last 1000 (by default)) - var visitedTokensWorkingWindow = new Queue<(long ObjectNumber, int Generation)>(InfiniteLoopWorkingWindow); - - var toProcess = - new Queue<(PageTreeNode thisPage, IndirectReference reference, DictionaryToken nodeDictionary, IndirectReference parentReference, - List nodeChildren)>(); - var firstPage = new PageTreeNode(nodeDictionaryInput, referenceInput, false, null); - var setChildren = new List(); - var firstPageChildren = new List(); - - setChildren.Add(() => firstPage.WithChildren(firstPageChildren)); - - toProcess.Enqueue( - (thisPage: firstPage, reference: referenceInput, nodeDictionary: nodeDictionaryInput, parentReference: parentReferenceInput, - nodeChildren: firstPageChildren)); - - do - { - var current = toProcess.Dequeue(); - - #region Break any potential infinite loop - // Remember the last 1000 (by default) tokens and if we attempt to process again break out of loop - var currentReferenceObjectNumber = current.reference.ObjectNumber; - var currentReferenceGeneration = current.reference.Generation; - if (visitedTokens.ContainsKey(currentReferenceObjectNumber)) - { - var generations = visitedTokens[currentReferenceObjectNumber]; - - if (generations.Contains(currentReferenceGeneration)) - { - var listOfLastVisitedToken = visitedTokensWorkingWindow.ToList(); - var indexOfCurrentTokenInListOfLastVisitedToken = listOfLastVisitedToken.IndexOf((currentReferenceObjectNumber, currentReferenceGeneration)); - var howManyTokensBack = Math.Abs(indexOfCurrentTokenInListOfLastVisitedToken - listOfLastVisitedToken.Count); //eg initate loop is taking us back to last token or five token back - System.Diagnostics.Debug.WriteLine($"Break infinite loop while processing page {pageNumber.PageCount+1} tokens. Token with object number {currentReferenceObjectNumber} and generation {currentReferenceGeneration} processed {howManyTokensBack} token(s) back. "); - continue; // don't reprocess token already processed. break infinite loop. Issue #519 - } - else - { - generations.Add(currentReferenceGeneration); - visitedTokens[currentReferenceObjectNumber] = generations; - } - } - else - { - visitedTokens.Add(currentReferenceObjectNumber, new HashSet() { currentReferenceGeneration }); - - visitedTokensWorkingWindow.Enqueue((currentReferenceObjectNumber, currentReferenceGeneration)); - if (visitedTokensWorkingWindow.Count >= InfiniteLoopWorkingWindow) - { - var toBeRemovedFromWorkingHashset = visitedTokensWorkingWindow.Dequeue(); - var toBeRemovedObjectNumber = toBeRemovedFromWorkingHashset.ObjectNumber; - var toBeRemovedGeneration = toBeRemovedFromWorkingHashset.Generation; - var generations = visitedTokens[toBeRemovedObjectNumber]; - generations.Remove(toBeRemovedGeneration); - if (generations.Count == 0) - { - visitedTokens.Remove(toBeRemovedObjectNumber); - } - else - { - visitedTokens[toBeRemovedObjectNumber] = generations; - } - } - } - #endregion - if (!current.nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken kids)) - { - if (!isLenientParsing) - { - throw new PdfDocumentFormatException($"Pages node in the document pages tree did not define a kids array: {current.nodeDictionary}."); - } - - kids = new ArrayToken(EmptyArray.Instance); - } - - foreach (var kid in kids.Data) - { - if (!(kid is IndirectReferenceToken kidRef)) - { - throw new PdfDocumentFormatException($"Kids array contained invalid entry (must be indirect reference): {kid}."); - } - - if (!DirectObjectFinder.TryGet(kidRef, pdfTokenScanner, out DictionaryToken kidDictionaryToken)) - { - throw new PdfDocumentFormatException($"Could not find dictionary associated with reference in pages kids array: {kidRef}."); - } - - bool isChildPage = CheckIfIsPage(kidDictionaryToken, current.reference, false, pdfTokenScanner, isLenientParsing); - - if (isChildPage) - { - var kidPageNode = - new PageTreeNode(kidDictionaryToken, kidRef.Data, true, pageNumber.PageCount).WithChildren(EmptyArray.Instance); - current.nodeChildren.Add(kidPageNode); - } - else - { - var kidChildNode = new PageTreeNode(kidDictionaryToken, kidRef.Data, false, null); - var kidChildren = new List(); - toProcess.Enqueue( - (thisPage: kidChildNode, reference: kidRef.Data, nodeDictionary: kidDictionaryToken, parentReference: current.reference, - nodeChildren: kidChildren)); - - setChildren.Add(() => kidChildNode.WithChildren(kidChildren)); - - current.nodeChildren.Add(kidChildNode); - } - } - } while (toProcess.Count > 0); - - foreach (var action in setChildren) - { - action(); - } - - foreach (var child in firstPage.Children.ToRecursiveOrderList(x=>x.Children).Where(child => child.IsPage)) - { - pageNumber.Increment(); - child.PageNumber = pageNumber.PageCount; - } - - return firstPage; - } - - private static bool CheckIfIsPage(DictionaryToken nodeDictionary, IndirectReference parentReference, bool isRoot, IPdfTokenScanner pdfTokenScanner, bool isLenientParsing) - { - var isPage = false; - - if (!nodeDictionary.TryGet(NameToken.Type, pdfTokenScanner, out NameToken type)) - { - if (!isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree did not define a type: {nodeDictionary}."); } - - if (!nodeDictionary.TryGet(NameToken.Kids, pdfTokenScanner, out ArrayToken _)) { isPage = true; } - } - else - { - isPage = type.Equals(NameToken.Page); - - if (!isPage && !type.Equals(NameToken.Pages) && !isLenientParsing) { throw new PdfDocumentFormatException($"Node in the document pages tree defined invalid type: {nodeDictionary}."); } - } - - if (!isLenientParsing && !isRoot) - { - if (!nodeDictionary.TryGet(NameToken.Parent, pdfTokenScanner, out IndirectReferenceToken parentReferenceToken)) { throw new PdfDocumentFormatException($"Could not find parent indirect reference token on pages tree node: {nodeDictionary}."); } - - if (!parentReferenceToken.Data.Equals(parentReference)) { throw new PdfDocumentFormatException($"Pages tree node parent reference {parentReferenceToken.Data} did not match actual parent {parentReference}."); } - } - - return isPage; + return new Catalog(dictionary, pages, namedDestinations); } } } diff --git a/src/UglyToad.PdfPig/Parser/PageFactory.cs b/src/UglyToad.PdfPig/Parser/PageFactory.cs index 03467e70..b8ce5337 100644 --- a/src/UglyToad.PdfPig/Parser/PageFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PageFactory.cs @@ -10,6 +10,7 @@ using Graphics; using Graphics.Operations; using Logging; + using Outline; using Parts; using Tokenization.Scanner; using Tokens; @@ -21,20 +22,24 @@ private readonly IResourceStore resourceStore; private readonly ILookupFilterProvider filterProvider; private readonly IPageContentParser pageContentParser; + private readonly ILog log; public PageFactory( IPdfTokenScanner pdfScanner, IResourceStore resourceStore, ILookupFilterProvider filterProvider, - IPageContentParser pageContentParser) + IPageContentParser pageContentParser, + ILog log) { this.resourceStore = resourceStore; this.filterProvider = filterProvider; this.pageContentParser = pageContentParser; this.pdfScanner = pdfScanner; + this.log = log; } - public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, InternalParsingOptions parsingOptions) + public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, + NamedDestinations namedDestinations, InternalParsingOptions parsingOptions) { if (dictionary == null) { @@ -48,8 +53,8 @@ parsingOptions.Logger.Error($"Page {number} had its type specified as {type} rather than 'Page'."); } - MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers, parsingOptions.Logger); - CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox, parsingOptions.Logger); + MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers); + CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox); var rotation = new PageRotationDegrees(pageTreeMembers.Rotation); if (dictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken)) @@ -133,11 +138,9 @@ content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions); } - var initialMatrix = ContentStreamProcessor.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, parsingOptions.Logger); - - var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, - new AnnotationProvider(pdfScanner, dictionary, initialMatrix), - pdfScanner); + var initialMatrix = ContentStreamProcessor.GetInitialMatrix(userSpaceUnit, mediaBox, cropBox, rotation, log); + var annotationProvider = new AnnotationProvider(pdfScanner, dictionary, initialMatrix, namedDestinations, log); + var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content, annotationProvider, pdfScanner); for (var i = 0; i < stackDepth; i++) { @@ -187,8 +190,7 @@ private CropBox GetCropBox( DictionaryToken dictionary, PageTreeMembers pageTreeMembers, - MediaBox mediaBox, - ILog log) + MediaBox mediaBox) { CropBox cropBox; if (dictionary.TryGet(NameToken.CropBox, out var cropBoxObject) && @@ -216,23 +218,22 @@ private MediaBox GetMediaBox( int number, DictionaryToken dictionary, - PageTreeMembers pageTreeMembers, - ILog log) + PageTreeMembers pageTreeMembers) { MediaBox mediaBox; - if (dictionary.TryGet(NameToken.MediaBox, out var mediaboxObject) - && DirectObjectFinder.TryGet(mediaboxObject, pdfScanner, out ArrayToken mediaboxArray)) + if (dictionary.TryGet(NameToken.MediaBox, out var mediaBoxObject) + && DirectObjectFinder.TryGet(mediaBoxObject, pdfScanner, out ArrayToken mediaBoxArray)) { - if (mediaboxArray.Length != 4) + if (mediaBoxArray.Length != 4) { - log.Error($"The MediaBox was the wrong length in the dictionary: {dictionary}. Array was: {mediaboxArray}. Defaulting to US Letter."); + log.Error($"The MediaBox was the wrong length in the dictionary: {dictionary}. Array was: {mediaBoxArray}. Defaulting to US Letter."); mediaBox = MediaBox.Letter; return mediaBox; } - mediaBox = new MediaBox(mediaboxArray.ToRectangle(pdfScanner)); + mediaBox = new MediaBox(mediaBoxArray.ToRectangle(pdfScanner)); } else { diff --git a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs index 0edc962e..2b962177 100644 --- a/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs +++ b/src/UglyToad.PdfPig/Parser/PdfDocumentFactory.cs @@ -176,14 +176,18 @@ crossReferenceTable.Trailer, parsingOptions.UseLenientParsing); + var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider, + new PageContentParser(new ReflectionGraphicsStateOperationFactory()), parsingOptions.Logger); + var catalog = CatalogFactory.Create( rootReference, rootDictionary, pdfScanner, + pageFactory, + parsingOptions.Logger, parsingOptions.UseLenientParsing); - var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider, - new PageContentParser(new ReflectionGraphicsStateOperationFactory())); + var acroFormFactory = new AcroFormFactory(pdfScanner, filterProvider, crossReferenceTable); var bookmarksProvider = new BookmarksProvider(parsingOptions.Logger, pdfScanner); diff --git a/src/UglyToad.PdfPig/PdfDocument.cs b/src/UglyToad.PdfPig/PdfDocument.cs index 2cc59fe9..cf6610f6 100644 --- a/src/UglyToad.PdfPig/PdfDocument.cs +++ b/src/UglyToad.PdfPig/PdfDocument.cs @@ -14,6 +14,8 @@ using Tokenization.Scanner; using Tokens; using Outline; + using Outline.Destinations; + using System.Linq; using Util.JetBrains.Annotations; /// @@ -42,6 +44,7 @@ [NotNull] private readonly Pages pages; + private readonly NamedDestinations namedDestinations; /// /// The metadata associated with this document. @@ -75,13 +78,12 @@ /// public bool IsEncrypted => encryptionDictionary != null; - internal PdfDocument( - IInputBytes inputBytes, - HeaderVersion version, + internal PdfDocument(IInputBytes inputBytes, + HeaderVersion version, CrossReferenceTable crossReferenceTable, IPageFactory pageFactory, Catalog catalog, - DocumentInformation information, + DocumentInformation information, EncryptionDictionary encryptionDictionary, IPdfTokenScanner pdfScanner, ILookupFilterProvider filterProvider, @@ -98,7 +100,8 @@ this.parsingOptions = parsingOptions; Information = information ?? throw new ArgumentNullException(nameof(information)); - pages = new Pages(catalog, pageFactory, pdfScanner); + pages = catalog.Pages; + namedDestinations = catalog.NamedDestinations; Structure = new Structure(catalog, crossReferenceTable, pdfScanner); Advanced = new AdvancedPdfDocumentAccess(pdfScanner, filterProvider, catalog); documentForm = new Lazy(() => acroFormFactory.GetAcroForm(catalog)); @@ -148,7 +151,7 @@ try { - return pages.GetPage(pageNumber, parsingOptions); + return pages.GetPage(pageNumber, namedDestinations, parsingOptions); } catch (Exception ex) { diff --git a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs index 2472145c..ccb1f792 100644 --- a/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs +++ b/src/UglyToad.PdfPig/Writer/PdfDocumentBuilder.cs @@ -276,7 +276,6 @@ namespace UglyToad.PdfPig.Writer return AddPage(rectangle.Width, rectangle.Height); } - internal IToken CopyToken(IPdfTokenScanner source, IToken token) { if (!existingCopies.TryGetValue(source, out var refs)) @@ -288,15 +287,18 @@ namespace UglyToad.PdfPig.Writer return WriterUtil.CopyToken(context, token, source, refs); } - internal class PageInfo + private class PageInfo { public DictionaryToken Page { get; set; } public IReadOnlyList Parents { get; set; } } + private readonly ConditionalWeakTable> existingCopies = new ConditionalWeakTable>(); + private readonly ConditionalWeakTable> existingTrees = new ConditionalWeakTable>(); + /// /// Add a new page with the specified size, this page will be included in the output when is called. /// @@ -315,7 +317,7 @@ namespace UglyToad.PdfPig.Writer { pagesInfos = new Dictionary(); int i = 1; - foreach (var (pageDict, parents) in WriterUtil.WalkTree(document.Structure.Catalog.PageTree)) + foreach (var (pageDict, parents) in WriterUtil.WalkTree(document.Structure.Catalog.Pages.PageTree)) { pagesInfos[i] = new PageInfo {