PdfPig/src/UglyToad.PdfPig/PdfDocument.cs
Jason Nelson da44e1a540
Improve code quality (#825)
* Avoid encoding ASCII in more cases

* Make Space a const

* Use WriteWhiteSpace extension to eliminate possible virtual call

* Use ASCII when encoding constrained character subset

* Simplify pragmas

* Revert Whitespace rename

* Fix using statement order

* Remove obsolete serialization support on .NET

* Remove obsolete serialization support on .NET (part 2)
2024-05-03 07:36:19 +01:00

315 lines
12 KiB
C#

namespace UglyToad.PdfPig
{
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using AcroForms;
using Content;
using Core;
using CrossReference;
using Encryption;
using Exceptions;
using Filters;
using Parser;
using Tokenization.Scanner;
using Tokens;
using Outline;
using Outline.Destinations;
/// <inheritdoc />
/// <summary>
/// Provides access to document level information for this PDF document as well as access to the <see cref="T:UglyToad.PdfPig.Content.Page"/>s contained in the document.
/// </summary>
public class PdfDocument : IDisposable
{
    // Set once Dispose has run; guarded members throw ObjectDisposedException afterwards.
    private bool isDisposed;

    // Created lazily on first call to TryGetForm; the value may be null when the factory yields no form.
    private readonly Lazy<AcroForm?> documentForm;

    private readonly HeaderVersion version;

    private readonly IInputBytes inputBytes;

    private readonly EncryptionDictionary? encryptionDictionary;

    private readonly IPdfTokenScanner pdfScanner;

    private readonly ILookupFilterProvider filterProvider;

    private readonly BookmarksProvider bookmarksProvider;

    private readonly ParsingOptions parsingOptions;

    private readonly Pages pages;

    private readonly NamedDestinations namedDestinations;

    /// <summary>
    /// The metadata associated with this document.
    /// </summary>
    public DocumentInformation Information { get; }

    /// <summary>
    /// Access to the underlying raw structure of the document.
    /// </summary>
    public Structure Structure { get; }

    /// <summary>
    /// Access to rare or advanced features of the PDF specification.
    /// </summary>
    public AdvancedPdfDocumentAccess Advanced { get; }

    /// <summary>
    /// The version number of the PDF specification which this file conforms to, for example 1.4.
    /// </summary>
    public double Version => version.Version;

    /// <summary>
    /// Get the number of pages in this document.
    /// </summary>
    public int NumberOfPages => pages.Count;

    /// <summary>
    /// Whether the document content is encrypted.
    /// </summary>
    [MemberNotNullWhen(true, nameof(encryptionDictionary))]
    public bool IsEncrypted => encryptionDictionary != null;

    /// <summary>
    /// Create a new <see cref="PdfDocument"/> from already parsed document components.
    /// </summary>
    /// <exception cref="ArgumentNullException">
    /// Thrown when <paramref name="version"/>, <paramref name="catalog"/>, <paramref name="information"/>,
    /// <paramref name="pdfScanner"/>, <paramref name="filterProvider"/> or <paramref name="bookmarksProvider"/> is <see langword="null"/>.
    /// </exception>
    internal PdfDocument(
        IInputBytes inputBytes,
        HeaderVersion version,
        CrossReferenceTable crossReferenceTable,
        Catalog catalog,
        DocumentInformation information,
        EncryptionDictionary? encryptionDictionary,
        IPdfTokenScanner pdfScanner,
        ILookupFilterProvider filterProvider,
        AcroFormFactory acroFormFactory,
        BookmarksProvider bookmarksProvider,
        ParsingOptions parsingOptions)
    {
        // The catalog is dereferenced below, so fail with a descriptive exception rather than a NullReferenceException.
        if (catalog is null)
        {
            throw new ArgumentNullException(nameof(catalog));
        }

        this.inputBytes = inputBytes;
        this.version = version ?? throw new ArgumentNullException(nameof(version));
        this.encryptionDictionary = encryptionDictionary;
        this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
        this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
        this.bookmarksProvider = bookmarksProvider ?? throw new ArgumentNullException(nameof(bookmarksProvider));
        this.parsingOptions = parsingOptions;
        Information = information ?? throw new ArgumentNullException(nameof(information));
        pages = catalog.Pages;
        namedDestinations = catalog.NamedDestinations;
        Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
        Advanced = new AdvancedPdfDocumentAccess(pdfScanner, filterProvider, catalog);

        // Deferred so the form is only built when a caller asks for it via TryGetForm.
        documentForm = new Lazy<AcroForm?>(() => acroFormFactory.GetAcroForm(catalog));
    }

    /// <summary>
    /// Creates a <see cref="PdfDocument"/> for reading from the provided file bytes.
    /// </summary>
    /// <param name="fileBytes">The bytes of the PDF file.</param>
    /// <param name="options">Optional parameters controlling parsing.</param>
    /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
    public static PdfDocument Open(byte[] fileBytes, ParsingOptions? options = null) => PdfDocumentFactory.Open(fileBytes, options);

    /// <summary>
    /// Opens a file and creates a <see cref="PdfDocument"/> for reading from the provided file path.
    /// </summary>
    /// <param name="filePath">The full path to the file location of the PDF file.</param>
    /// <param name="options">Optional parameters controlling parsing.</param>
    /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
    public static PdfDocument Open(string filePath, ParsingOptions? options = null) => PdfDocumentFactory.Open(filePath, options);

    /// <summary>
    /// Creates a <see cref="PdfDocument"/> for reading from the provided stream.
    /// The caller must manage disposing the stream. The created PdfDocument will not dispose the stream.
    /// </summary>
    /// <param name="stream">
    /// A stream of the file contents, this must support reading and seeking.
    /// The PdfDocument will not dispose of the provided stream.
    /// </param>
    /// <param name="options">Optional parameters controlling parsing.</param>
    /// <returns>A <see cref="PdfDocument"/> providing access to the file contents.</returns>
    public static PdfDocument Open(Stream stream, ParsingOptions? options = null) => PdfDocumentFactory.Open(stream, options);

    /// <summary>
    /// Registers the provided page factory instance with this document's pages.
    /// The factory is expected to be the one used by <see cref="GetPage{TPage}(int)"/> and
    /// <see cref="GetPages{TPage}()"/> for the same <typeparamref name="TPage"/>.
    /// </summary>
    /// <typeparam name="TPage">The type of page produced by the factory.</typeparam>
    /// <param name="pageFactory">The page factory instance to register.</param>
    public void AddPageFactory<TPage>(IPageFactory<TPage> pageFactory)
    {
        pages.AddPageFactory(pageFactory);
    }

    /// <summary>
    /// Registers a page factory with this document's pages by type rather than by instance.
    /// The factory is expected to be the one used by <see cref="GetPage{TPage}(int)"/> and
    /// <see cref="GetPages{TPage}()"/> for the same <typeparamref name="TPage"/>.
    /// </summary>
    /// <typeparam name="TPage">The type of page produced by the factory.</typeparam>
    /// <typeparam name="TPageFactory">The type of the page factory to register.</typeparam>
#if NET
    public void AddPageFactory<TPage, [System.Diagnostics.CodeAnalysis.DynamicallyAccessedMembers(System.Diagnostics.CodeAnalysis.DynamicallyAccessedMemberTypes.PublicConstructors)] TPageFactory>() where TPageFactory : IPageFactory<TPage>
#else
    public void AddPageFactory<TPage, TPageFactory>() where TPageFactory : IPageFactory<TPage>
#endif
    {
        pages.AddPageFactory<TPage, TPageFactory>();
    }

    /// <summary>
    /// Get the page with the specified page number (1 indexed).
    /// </summary>
    /// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
    /// <returns>The page.</returns>
    /// <exception cref="ObjectDisposedException">Thrown when the document has been disposed.</exception>
    /// <exception cref="PdfDocumentEncryptedException">Thrown when retrieving the page fails and the document is encrypted.</exception>
    public Page GetPage(int pageNumber)
    {
        ThrowIfDisposed("Cannot access page after the document is disposed.");

        parsingOptions.Logger.Debug($"Accessing page {pageNumber}.");

        try
        {
            return pages.GetPage(pageNumber, namedDestinations, parsingOptions);
        }
        catch (Exception ex)
        {
            // For encrypted documents wrap the failure so callers can see encryption as the likely cause.
            if (IsEncrypted)
            {
                throw new PdfDocumentEncryptedException("Document was encrypted which may have caused error when retrieving page.", encryptionDictionary, ex);
            }

            throw;
        }
    }

    /// <summary>
    /// Get the page with the specified page number (1 indexed), using the specified page factory.
    /// </summary>
    /// <typeparam name="TPage">The type of page to return.</typeparam>
    /// <param name="pageNumber">The number of the page to return, this starts from 1.</param>
    /// <returns>The page.</returns>
    /// <exception cref="ObjectDisposedException">Thrown when the document has been disposed.</exception>
    /// <exception cref="PdfDocumentEncryptedException">Thrown when retrieving the page fails and the document is encrypted.</exception>
    public TPage GetPage<TPage>(int pageNumber)
    {
        ThrowIfDisposed("Cannot access page after the document is disposed.");

        parsingOptions.Logger.Debug($"Accessing page {pageNumber}.");

        try
        {
            return pages.GetPage<TPage>(pageNumber, namedDestinations, parsingOptions);
        }
        catch (Exception ex)
        {
            // For encrypted documents wrap the failure so callers can see encryption as the likely cause.
            if (IsEncrypted)
            {
                throw new PdfDocumentEncryptedException("Document was encrypted which may have caused error when retrieving page.", encryptionDictionary, ex);
            }

            throw;
        }
    }

    /// <summary>
    /// Gets all pages in this document in order.
    /// </summary>
    /// <remarks>Pages are retrieved lazily, one per iteration, through <see cref="GetPage(int)"/>.</remarks>
    public IEnumerable<Page> GetPages()
    {
        for (var pageNumber = 1; pageNumber <= NumberOfPages; pageNumber++)
        {
            yield return GetPage(pageNumber);
        }
    }

    /// <summary>
    /// Gets all pages in this document in order, using the specified page factory.
    /// </summary>
    /// <typeparam name="TPage">The type of page to return.</typeparam>
    /// <remarks>Pages are retrieved lazily, one per iteration, through <see cref="GetPage{TPage}(int)"/>.</remarks>
    public IEnumerable<TPage> GetPages<TPage>()
    {
        for (var pageNumber = 1; pageNumber <= NumberOfPages; pageNumber++)
        {
            yield return GetPage<TPage>(pageNumber);
        }
    }

    /// <summary>
    /// Get the document level metadata if present.
    /// The metadata is XML in the (Extensible Metadata Platform) XMP format.
    /// </summary>
    /// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
    /// <param name="metadata">The metadata stream if it exists.</param>
    /// <returns><see langword="true"/> if the metadata is present, <see langword="false"/> otherwise.</returns>
    public bool TryGetXmpMetadata([NotNullWhen(true)] out XmpMetadata? metadata)
    {
        ThrowIfDisposed("Cannot access the document metadata after the document is disposed.");

        if (Structure.Catalog.CatalogDictionary.TryGet(NameToken.Metadata, pdfScanner, out StreamToken? xmpStreamToken))
        {
            metadata = new XmpMetadata(xmpStreamToken, filterProvider, pdfScanner);
            return true;
        }

        metadata = null;
        return false;
    }

    /// <summary>
    /// Gets the bookmarks if this document contains some.
    /// </summary>
    /// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
    /// <param name="bookmarks">The bookmarks if they exist.</param>
    /// <returns><see langword="true"/> if bookmarks are present, <see langword="false"/> otherwise.</returns>
    public bool TryGetBookmarks([NotNullWhen(true)] out Bookmarks? bookmarks)
    {
        ThrowIfDisposed("Cannot access the bookmarks after the document is disposed.");

        bookmarks = bookmarksProvider.GetBookmarks(Structure.Catalog);

        return bookmarks != null;
    }

    /// <summary>
    /// Gets the form if this document contains one.
    /// </summary>
    /// <remarks>This will throw a <see cref="ObjectDisposedException"/> if called on a disposed <see cref="PdfDocument"/>.</remarks>
    /// <param name="form">The <see cref="AcroForm"/> from the document, or <see langword="null"/> if not present.</param>
    /// <returns><see langword="true"/> if the form is present, <see langword="false"/> otherwise.</returns>
    public bool TryGetForm([NotNullWhen(true)] out AcroForm? form)
    {
        ThrowIfDisposed("Cannot access the form after the document is disposed.");

        form = documentForm.Value;

        return form != null;
    }

    /// <inheritdoc />
    /// <summary>
    /// Dispose the <see cref="T:UglyToad.PdfPig.PdfDocument" /> and close any unmanaged resources.
    /// </summary>
    /// <remarks>
    /// Each owned resource is disposed independently so that a failure in one does not prevent the others from being released.
    /// Failures are logged rather than thrown. Calling this method more than once has no further effect.
    /// </remarks>
    public void Dispose()
    {
        if (isDisposed)
        {
            return;
        }

        try
        {
            // Lambdas (rather than method groups) keep any null dereference inside the helper's try block.
            DisposeAndLogFailure(() => Advanced.Dispose());
            DisposeAndLogFailure(() => pdfScanner.Dispose());
            DisposeAndLogFailure(() => inputBytes.Dispose());
            DisposeAndLogFailure(() => pages.Dispose());
        }
        finally
        {
            isDisposed = true;
        }
    }

    /// <summary>
    /// Runs a single dispose action and logs, instead of propagating, any exception it throws.
    /// </summary>
    private void DisposeAndLogFailure(Action dispose)
    {
        try
        {
            dispose();
        }
        catch (Exception ex)
        {
            parsingOptions.Logger.Error("Failed disposing the PdfDocument due to an error.", ex);
        }
    }

    /// <summary>
    /// Throws an <see cref="ObjectDisposedException"/> with the given message if this document has been disposed.
    /// </summary>
    private void ThrowIfDisposed(string message)
    {
        if (isDisposed)
        {
            // The single-string constructor interprets its argument as the object name, not the message,
            // so use the (objectName, message) overload to surface the explanatory text correctly.
            throw new ObjectDisposedException(nameof(PdfDocument), message);
        }
    }
}
}