mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-04-05 20:55:01 +08:00
#482 add skip missing fonts option and pass parsing options to content stream processor
This doesn't fix the reported issue, since the PDF itself is corrupted on page 8; however, it will allow recovery in some scenarios where text content isn't important. It also adds a more informative error when a stream is unintentionally passed with a non-zero offset.
This commit is contained in:
parent
c643facee0
commit
e2246a88bb
src/UglyToad.PdfPig
@ -4,6 +4,10 @@
|
||||
|
||||
internal interface IPageFactory
|
||||
{
|
||||
Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, bool clipPaths);
|
||||
Page Create(
|
||||
int number,
|
||||
DictionaryToken dictionary,
|
||||
PageTreeMembers pageTreeMembers,
|
||||
InternalParsingOptions parsingOptions);
|
||||
}
|
||||
}
|
@ -23,10 +23,12 @@
|
||||
Count = catalog.PagesDictionary.GetIntOrDefault(NameToken.Count);
|
||||
}
|
||||
|
||||
public Page GetPage(int pageNumber, bool clipPaths)
|
||||
public Page GetPage(int pageNumber, InternalParsingOptions parsingOptions)
|
||||
{
|
||||
if (pageNumber <= 0 || pageNumber > Count)
|
||||
{
|
||||
parsingOptions.Logger.Error($"Page {pageNumber} requested but is out of range.");
|
||||
|
||||
throw new ArgumentOutOfRangeException(nameof(pageNumber),
|
||||
$"Page number {pageNumber} invalid, must be between 1 and {Count}.");
|
||||
}
|
||||
@ -63,7 +65,11 @@
|
||||
}
|
||||
}
|
||||
|
||||
var page = pageFactory.Create(pageNumber, pageNode.NodeDictionary, pageTreeMembers, clipPaths);
|
||||
var page = pageFactory.Create(
|
||||
pageNumber,
|
||||
pageNode.NodeDictionary,
|
||||
pageTreeMembers,
|
||||
parsingOptions);
|
||||
|
||||
return page;
|
||||
}
|
||||
|
@ -5,7 +5,6 @@
|
||||
using Core;
|
||||
using Filters;
|
||||
using Geometry;
|
||||
using Logging;
|
||||
using Operations;
|
||||
using Parser;
|
||||
using PdfFonts;
|
||||
@ -49,9 +48,8 @@
|
||||
private readonly IPdfTokenScanner pdfScanner;
|
||||
private readonly IPageContentParser pageContentParser;
|
||||
private readonly ILookupFilterProvider filterProvider;
|
||||
private readonly ILog log;
|
||||
private readonly bool clipPaths;
|
||||
private readonly PdfVector pageSize;
|
||||
private readonly InternalParsingOptions parsingOptions;
|
||||
private readonly MarkedContentStack markedContentStack = new MarkedContentStack();
|
||||
|
||||
private Stack<CurrentGraphicsState> graphicsStack = new Stack<CurrentGraphicsState>();
|
||||
@ -90,9 +88,8 @@
|
||||
IPdfTokenScanner pdfScanner,
|
||||
IPageContentParser pageContentParser,
|
||||
ILookupFilterProvider filterProvider,
|
||||
ILog log,
|
||||
bool clipPaths,
|
||||
PdfVector pageSize)
|
||||
PdfVector pageSize,
|
||||
InternalParsingOptions parsingOptions)
|
||||
{
|
||||
this.resourceStore = resourceStore;
|
||||
this.userSpaceUnit = userSpaceUnit;
|
||||
@ -100,9 +97,8 @@
|
||||
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
|
||||
this.pageContentParser = pageContentParser ?? throw new ArgumentNullException(nameof(pageContentParser));
|
||||
this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
|
||||
this.log = log;
|
||||
this.clipPaths = clipPaths;
|
||||
this.pageSize = pageSize;
|
||||
this.parsingOptions = parsingOptions;
|
||||
|
||||
// initiate CurrentClippingPath to cropBox
|
||||
var clippingSubpath = new PdfSubpath();
|
||||
@ -230,6 +226,15 @@
|
||||
|
||||
if (font == null)
|
||||
{
|
||||
if (parsingOptions.SkipMissingFonts)
|
||||
{
|
||||
parsingOptions.Logger.Warn($"Skipping a missing font with name {currentState.FontState.FontName} " +
|
||||
$"since it is not present in the document and {nameof(InternalParsingOptions.SkipMissingFonts)} " +
|
||||
"is set to true. This may result in some text being skipped and not included in the output.");
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
throw new InvalidOperationException($"Could not find the font with name {currentState.FontState.FontName} in the resource store. It has not been loaded yet.");
|
||||
}
|
||||
|
||||
@ -253,7 +258,8 @@
|
||||
|
||||
if (!foundUnicode || unicode == null)
|
||||
{
|
||||
log.Warn($"We could not find the corresponding character with code {code} in font {font.Name}.");
|
||||
parsingOptions.Logger.Warn($"We could not find the corresponding character with code {code} in font {font.Name}.");
|
||||
|
||||
// Try casting directly to string as in PDFBox 1.8.
|
||||
unicode = new string((char)code, 1);
|
||||
}
|
||||
@ -494,7 +500,7 @@
|
||||
|
||||
var contentStream = formStream.Decode(filterProvider, pdfScanner);
|
||||
|
||||
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream), log);
|
||||
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentStream), parsingOptions.Logger);
|
||||
|
||||
// 3. We don't respect clipping currently.
|
||||
|
||||
@ -677,7 +683,7 @@
|
||||
|
||||
if (CurrentPath.IsClipping)
|
||||
{
|
||||
if (!clipPaths)
|
||||
if (!parsingOptions.ClipPaths)
|
||||
{
|
||||
// if we don't clip paths, add clipping path to paths
|
||||
paths.Add(CurrentPath);
|
||||
@ -717,9 +723,9 @@
|
||||
CurrentPath.FillColor = currentState.CurrentNonStrokingColor;
|
||||
}
|
||||
|
||||
if (clipPaths)
|
||||
if (parsingOptions.ClipPaths)
|
||||
{
|
||||
var clippedPath = currentState.CurrentClippingPath.Clip(CurrentPath, log);
|
||||
var clippedPath = currentState.CurrentClippingPath.Clip(CurrentPath, parsingOptions.Logger);
|
||||
if (clippedPath != null)
|
||||
{
|
||||
paths.Add(clippedPath);
|
||||
@ -745,15 +751,15 @@
|
||||
AddCurrentSubpath();
|
||||
CurrentPath.SetClipping(clippingRule);
|
||||
|
||||
if (clipPaths)
|
||||
if (parsingOptions.ClipPaths)
|
||||
{
|
||||
var currentClipping = GetCurrentState().CurrentClippingPath;
|
||||
currentClipping.SetClipping(clippingRule);
|
||||
|
||||
var newClippings = CurrentPath.Clip(currentClipping, log);
|
||||
var newClippings = CurrentPath.Clip(currentClipping, parsingOptions.Logger);
|
||||
if (newClippings == null)
|
||||
{
|
||||
log.Warn("Empty clipping path found. Clipping path not updated.");
|
||||
parsingOptions.Logger.Warn("Empty clipping path found. Clipping path not updated.");
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -796,7 +802,7 @@
|
||||
{
|
||||
if (inlineImageBuilder != null)
|
||||
{
|
||||
log?.Error("Begin inline image (BI) command encountered while another inline image was active.");
|
||||
parsingOptions.Logger.Error("Begin inline image (BI) command encountered while another inline image was active.");
|
||||
}
|
||||
|
||||
inlineImageBuilder = new InlineImageBuilder();
|
||||
@ -806,7 +812,7 @@
|
||||
{
|
||||
if (inlineImageBuilder == null)
|
||||
{
|
||||
log?.Error("Begin inline image data (ID) command encountered without a corresponding begin inline image (BI) command.");
|
||||
parsingOptions.Logger.Error("Begin inline image data (ID) command encountered without a corresponding begin inline image (BI) command.");
|
||||
return;
|
||||
}
|
||||
|
||||
@ -817,7 +823,7 @@
|
||||
{
|
||||
if (inlineImageBuilder == null)
|
||||
{
|
||||
log?.Error("End inline image (EI) command encountered without a corresponding begin inline image (BI) command.");
|
||||
parsingOptions.Logger.Error("End inline image (EI) command encountered without a corresponding begin inline image (BI) command.");
|
||||
return;
|
||||
}
|
||||
|
||||
|
34
src/UglyToad.PdfPig/InternalParsingOptions.cs
Normal file
34
src/UglyToad.PdfPig/InternalParsingOptions.cs
Normal file
@ -0,0 +1,34 @@
|
||||
namespace UglyToad.PdfPig;
|
||||
|
||||
using Logging;
|
||||
using System.Collections.Generic;
|
||||
|
||||
/// <summary>
|
||||
/// <see cref="ParsingOptions"/> but without being a public API.
|
||||
/// </summary>
|
||||
// Immutable bundle of the resolved parsing settings. In this change it replaces the
// separate log / clipPaths / isLenientParsing / passwords arguments that were previously
// threaded individually through the document, page factory and content stream processor.
internal class InternalParsingOptions
|
||||
{
|
||||
/// <summary>
/// Passwords to try for the document; handed to the encryption handler when the
/// document has an encryption dictionary.
/// </summary>
public IReadOnlyList<string> Passwords { get; }
|
||||
|
||||
/// <summary>
/// Lenient-parsing flag forwarded to the file header, trailer and cross-reference
/// parsers and to the catalog and document information factories.
/// </summary>
public bool UseLenientParsing { get; }
|
||||
|
||||
/// <summary>
/// When <c>true</c> the content stream processor clips paths against the current
/// clipping path; when <c>false</c> clipping paths are added to the page's paths instead.
/// </summary>
public bool ClipPaths { get; }
|
||||
|
||||
/// <summary>
/// When <c>true</c> the content stream processor logs a warning and skips text whose
/// font cannot be found, rather than throwing an <see cref="System.InvalidOperationException"/>.
/// </summary>
public bool SkipMissingFonts { get; }
|
||||
|
||||
/// <summary>
/// Logger used for diagnostics. Consumers call it without null checks, so callers are
/// expected to supply a non-null instance (document opening falls back to a no-op log).
/// </summary>
public ILog Logger { get; }
|
||||
|
||||
/// <summary>
/// Create a new <see cref="InternalParsingOptions"/>. Each argument is stored as given;
/// no validation or copying is performed here.
/// </summary>
public InternalParsingOptions(
|
||||
IReadOnlyList<string> passwords,
|
||||
bool useLenientParsing,
|
||||
bool clipPaths,
|
||||
bool skipMissingFonts,
|
||||
ILog logger)
|
||||
{
|
||||
Passwords = passwords;
|
||||
UseLenientParsing = useLenientParsing;
|
||||
ClipPaths = clipPaths;
|
||||
SkipMissingFonts = skipMissingFonts;
|
||||
Logger = logger;
|
||||
}
|
||||
}
|
@ -21,20 +21,20 @@
|
||||
private readonly IResourceStore resourceStore;
|
||||
private readonly ILookupFilterProvider filterProvider;
|
||||
private readonly IPageContentParser pageContentParser;
|
||||
private readonly ILog log;
|
||||
|
||||
public PageFactory(IPdfTokenScanner pdfScanner, IResourceStore resourceStore, ILookupFilterProvider filterProvider,
|
||||
IPageContentParser pageContentParser,
|
||||
ILog log)
|
||||
public PageFactory(
|
||||
IPdfTokenScanner pdfScanner,
|
||||
IResourceStore resourceStore,
|
||||
ILookupFilterProvider filterProvider,
|
||||
IPageContentParser pageContentParser)
|
||||
{
|
||||
this.resourceStore = resourceStore;
|
||||
this.filterProvider = filterProvider;
|
||||
this.pageContentParser = pageContentParser;
|
||||
this.log = log;
|
||||
this.pdfScanner = pdfScanner;
|
||||
}
|
||||
|
||||
public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, bool clipPaths)
|
||||
public Page Create(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers, InternalParsingOptions parsingOptions)
|
||||
{
|
||||
if (dictionary == null)
|
||||
{
|
||||
@ -45,11 +45,11 @@
|
||||
|
||||
if (type != null && !type.Equals(NameToken.Page))
|
||||
{
|
||||
log?.Error($"Page {number} had its type specified as {type} rather than 'Page'.");
|
||||
parsingOptions.Logger.Error($"Page {number} had its type specified as {type} rather than 'Page'.");
|
||||
}
|
||||
|
||||
MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers);
|
||||
CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox);
|
||||
MediaBox mediaBox = GetMediaBox(number, dictionary, pageTreeMembers, parsingOptions.Logger);
|
||||
CropBox cropBox = GetCropBox(dictionary, pageTreeMembers, mediaBox, parsingOptions.Logger);
|
||||
|
||||
var rotation = new PageRotationDegrees(pageTreeMembers.Rotation);
|
||||
if (dictionary.TryGet(NameToken.Rotate, pdfScanner, out NumericToken rotateToken))
|
||||
@ -130,7 +130,7 @@
|
||||
}
|
||||
}
|
||||
|
||||
content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, clipPaths, mediaBox);
|
||||
content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -143,7 +143,7 @@
|
||||
|
||||
var bytes = contentStream.Decode(filterProvider, pdfScanner);
|
||||
|
||||
content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, clipPaths, mediaBox);
|
||||
content = GetContent(number, bytes, cropBox, userSpaceUnit, rotation, mediaBox, parsingOptions);
|
||||
}
|
||||
|
||||
var page = new Page(number, dictionary, mediaBox, cropBox, rotation, content,
|
||||
@ -158,18 +158,28 @@
|
||||
return page;
|
||||
}
|
||||
|
||||
private PageContent GetContent(int pageNumber, IReadOnlyList<byte> contentBytes, CropBox cropBox, UserSpaceUnit userSpaceUnit,
|
||||
PageRotationDegrees rotation, bool clipPaths, MediaBox mediaBox)
|
||||
private PageContent GetContent(
|
||||
int pageNumber,
|
||||
IReadOnlyList<byte> contentBytes,
|
||||
CropBox cropBox,
|
||||
UserSpaceUnit userSpaceUnit,
|
||||
PageRotationDegrees rotation,
|
||||
MediaBox mediaBox,
|
||||
InternalParsingOptions parsingOptions)
|
||||
{
|
||||
var operations = pageContentParser.Parse(pageNumber, new ByteArrayInputBytes(contentBytes),
|
||||
log);
|
||||
parsingOptions.Logger);
|
||||
|
||||
var context = new ContentStreamProcessor(cropBox.Bounds, resourceStore, userSpaceUnit, rotation, pdfScanner,
|
||||
var context = new ContentStreamProcessor(
|
||||
cropBox.Bounds,
|
||||
resourceStore,
|
||||
userSpaceUnit,
|
||||
rotation,
|
||||
pdfScanner,
|
||||
pageContentParser,
|
||||
filterProvider,
|
||||
log,
|
||||
clipPaths,
|
||||
new PdfVector(mediaBox.Bounds.Width, mediaBox.Bounds.Height));
|
||||
new PdfVector(mediaBox.Bounds.Width, mediaBox.Bounds.Height),
|
||||
parsingOptions);
|
||||
|
||||
return context.Process(pageNumber, operations);
|
||||
}
|
||||
@ -185,7 +195,11 @@
|
||||
return spaceUnits;
|
||||
}
|
||||
|
||||
private CropBox GetCropBox(DictionaryToken dictionary, PageTreeMembers pageTreeMembers, MediaBox mediaBox)
|
||||
private CropBox GetCropBox(
|
||||
DictionaryToken dictionary,
|
||||
PageTreeMembers pageTreeMembers,
|
||||
MediaBox mediaBox,
|
||||
ILog log)
|
||||
{
|
||||
CropBox cropBox;
|
||||
if (dictionary.TryGet(NameToken.CropBox, out var cropBoxObject) &&
|
||||
@ -210,7 +224,11 @@
|
||||
return cropBox;
|
||||
}
|
||||
|
||||
private MediaBox GetMediaBox(int number, DictionaryToken dictionary, PageTreeMembers pageTreeMembers)
|
||||
private MediaBox GetMediaBox(
|
||||
int number,
|
||||
DictionaryToken dictionary,
|
||||
PageTreeMembers pageTreeMembers,
|
||||
ILog log)
|
||||
{
|
||||
MediaBox mediaBox;
|
||||
if (dictionary.TryGet(NameToken.MediaBox, out var mediaboxObject)
|
||||
|
@ -45,9 +45,25 @@
|
||||
|
||||
internal static PdfDocument Open(Stream stream, ParsingOptions options)
|
||||
{
|
||||
var initialPosition = stream.Position;
|
||||
|
||||
var streamInput = new StreamInputBytes(stream, false);
|
||||
|
||||
return Open(streamInput, options);
|
||||
try
|
||||
{
|
||||
return Open(streamInput, options);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
if (initialPosition != 0)
|
||||
{
|
||||
throw new InvalidOperationException(
|
||||
"Could not parse document due to an error, the input stream was not at position zero when provided to the Open method.",
|
||||
ex);
|
||||
}
|
||||
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
||||
private static PdfDocument Open(IInputBytes inputBytes, ParsingOptions options = null)
|
||||
@ -75,19 +91,28 @@
|
||||
passwords.Add(string.Empty);
|
||||
}
|
||||
|
||||
var document = OpenDocument(inputBytes, tokenScanner, options?.Logger ?? new NoOpLog(), isLenientParsing, passwords, clipPaths);
|
||||
var finalOptions = new InternalParsingOptions(
|
||||
passwords,
|
||||
isLenientParsing,
|
||||
clipPaths,
|
||||
options?.SkipMissingFonts ?? false,
|
||||
options?.Logger ?? new NoOpLog());
|
||||
|
||||
var document = OpenDocument(inputBytes, tokenScanner, finalOptions);
|
||||
|
||||
return document;
|
||||
}
|
||||
|
||||
private static PdfDocument OpenDocument(IInputBytes inputBytes, ISeekableTokenScanner scanner, ILog log, bool isLenientParsing,
|
||||
IReadOnlyList<string> passwords, bool clipPaths)
|
||||
private static PdfDocument OpenDocument(
|
||||
IInputBytes inputBytes,
|
||||
ISeekableTokenScanner scanner,
|
||||
InternalParsingOptions parsingOptions)
|
||||
{
|
||||
var filterProvider = new FilterProviderWithLookup(DefaultFilterProvider.Instance);
|
||||
|
||||
CrossReferenceTable crossReferenceTable = null;
|
||||
|
||||
var xrefValidator = new XrefOffsetValidator(log);
|
||||
var xrefValidator = new XrefOffsetValidator(parsingOptions.Logger);
|
||||
|
||||
// We're ok with this since our intent is to lazily load the cross reference table.
|
||||
// ReSharper disable once AccessToModifiedClosure
|
||||
@ -95,30 +120,39 @@
|
||||
var pdfScanner = new PdfTokenScanner(inputBytes, locationProvider, filterProvider, NoOpEncryptionHandler.Instance);
|
||||
|
||||
var crossReferenceStreamParser = new CrossReferenceStreamParser(filterProvider);
|
||||
var crossReferenceParser = new CrossReferenceParser(log, xrefValidator, crossReferenceStreamParser);
|
||||
var crossReferenceParser = new CrossReferenceParser(parsingOptions.Logger, xrefValidator, crossReferenceStreamParser);
|
||||
|
||||
var version = FileHeaderParser.Parse(scanner, inputBytes, isLenientParsing, log);
|
||||
var version = FileHeaderParser.Parse(scanner, inputBytes, parsingOptions.UseLenientParsing, parsingOptions.Logger);
|
||||
|
||||
var crossReferenceOffset = FileTrailerParser.GetFirstCrossReferenceOffset(inputBytes, scanner,
|
||||
isLenientParsing) + version.OffsetInFile;
|
||||
var crossReferenceOffset = FileTrailerParser.GetFirstCrossReferenceOffset(
|
||||
inputBytes,
|
||||
scanner,
|
||||
parsingOptions.UseLenientParsing) + version.OffsetInFile;
|
||||
|
||||
// TODO: make this use the scanner.
|
||||
var validator = new CrossReferenceOffsetValidator(xrefValidator);
|
||||
|
||||
crossReferenceOffset = validator.Validate(crossReferenceOffset, scanner, inputBytes, isLenientParsing);
|
||||
crossReferenceOffset = validator.Validate(crossReferenceOffset, scanner, inputBytes, parsingOptions.UseLenientParsing);
|
||||
|
||||
crossReferenceTable = crossReferenceParser.Parse(inputBytes, isLenientParsing,
|
||||
crossReferenceTable = crossReferenceParser.Parse(
|
||||
inputBytes,
|
||||
parsingOptions.UseLenientParsing,
|
||||
crossReferenceOffset,
|
||||
version.OffsetInFile,
|
||||
pdfScanner,
|
||||
scanner);
|
||||
|
||||
var (rootReference, rootDictionary) = ParseTrailer(crossReferenceTable, isLenientParsing,
|
||||
var (rootReference, rootDictionary) = ParseTrailer(
|
||||
crossReferenceTable,
|
||||
parsingOptions.UseLenientParsing,
|
||||
pdfScanner,
|
||||
out var encryptionDictionary);
|
||||
|
||||
var encryptionHandler = encryptionDictionary != null ?
|
||||
(IEncryptionHandler)new EncryptionHandler(encryptionDictionary, crossReferenceTable.Trailer, passwords)
|
||||
(IEncryptionHandler)new EncryptionHandler(
|
||||
encryptionDictionary,
|
||||
crossReferenceTable.Trailer,
|
||||
parsingOptions.Passwords)
|
||||
: NoOpEncryptionHandler.Instance;
|
||||
|
||||
pdfScanner.UpdateEncryptionHandler(encryptionHandler);
|
||||
@ -128,35 +162,45 @@
|
||||
|
||||
var type1Handler = new Type1FontHandler(pdfScanner, filterProvider, encodingReader);
|
||||
|
||||
var fontFactory = new FontFactory(log, new Type0FontHandler(cidFontFactory,
|
||||
var fontFactory = new FontFactory(parsingOptions.Logger, new Type0FontHandler(cidFontFactory,
|
||||
filterProvider, pdfScanner),
|
||||
new TrueTypeFontHandler(log, pdfScanner, filterProvider, encodingReader, SystemFontFinder.Instance,
|
||||
new TrueTypeFontHandler(parsingOptions.Logger, pdfScanner, filterProvider, encodingReader, SystemFontFinder.Instance,
|
||||
type1Handler),
|
||||
type1Handler,
|
||||
new Type3FontHandler(pdfScanner, filterProvider, encodingReader));
|
||||
|
||||
var resourceContainer = new ResourceStore(pdfScanner, fontFactory);
|
||||
|
||||
var information = DocumentInformationFactory.Create(pdfScanner, crossReferenceTable.Trailer, isLenientParsing);
|
||||
var information = DocumentInformationFactory.Create(
|
||||
pdfScanner,
|
||||
crossReferenceTable.Trailer,
|
||||
parsingOptions.UseLenientParsing);
|
||||
|
||||
var catalog = CatalogFactory.Create(rootReference, rootDictionary, pdfScanner, isLenientParsing);
|
||||
var catalog = CatalogFactory.Create(
|
||||
rootReference,
|
||||
rootDictionary,
|
||||
pdfScanner,
|
||||
parsingOptions.UseLenientParsing);
|
||||
|
||||
var pageFactory = new PageFactory(pdfScanner, resourceContainer, filterProvider,
|
||||
new PageContentParser(new ReflectionGraphicsStateOperationFactory()),
|
||||
log);
|
||||
|
||||
var caching = new ParsingCachingProviders(resourceContainer);
|
||||
new PageContentParser(new ReflectionGraphicsStateOperationFactory()));
|
||||
|
||||
var acroFormFactory = new AcroFormFactory(pdfScanner, filterProvider, crossReferenceTable);
|
||||
var bookmarksProvider = new BookmarksProvider(log, pdfScanner);
|
||||
var bookmarksProvider = new BookmarksProvider(parsingOptions.Logger, pdfScanner);
|
||||
|
||||
return new PdfDocument(log, inputBytes, version, crossReferenceTable, caching, pageFactory, catalog, information,
|
||||
return new PdfDocument(
|
||||
inputBytes,
|
||||
version,
|
||||
crossReferenceTable,
|
||||
pageFactory,
|
||||
catalog,
|
||||
information,
|
||||
encryptionDictionary,
|
||||
pdfScanner,
|
||||
filterProvider,
|
||||
acroFormFactory,
|
||||
bookmarksProvider,
|
||||
clipPaths);
|
||||
parsingOptions);
|
||||
}
|
||||
|
||||
private static (IndirectReference, DictionaryToken) ParseTrailer(CrossReferenceTable crossReferenceTable, bool isLenientParsing, IPdfTokenScanner pdfTokenScanner,
|
||||
|
@ -48,5 +48,11 @@
|
||||
/// All passwords to try when opening this document, will include any values set for <see cref="Password"/>.
|
||||
/// </summary>
|
||||
public List<string> Passwords { get; set; } = new List<string>();
|
||||
|
||||
/// <summary>
|
||||
/// Skip extracting content where the font could not be found. This will result in some letters being skipped/missed
|
||||
/// but will prevent the library throwing where the source PDF has some corrupted text.
|
||||
/// </summary>
|
||||
public bool SkipMissingFonts { get; set; } = false;
|
||||
}
|
||||
}
|
@ -10,7 +10,6 @@
|
||||
using Encryption;
|
||||
using Exceptions;
|
||||
using Filters;
|
||||
using Logging;
|
||||
using Parser;
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
@ -28,16 +27,9 @@
|
||||
|
||||
[NotNull]
|
||||
private readonly HeaderVersion version;
|
||||
|
||||
private readonly ILog log;
|
||||
|
||||
private readonly IInputBytes inputBytes;
|
||||
|
||||
private readonly bool clipPaths;
|
||||
|
||||
[NotNull]
|
||||
private readonly ParsingCachingProviders cachingProviders;
|
||||
|
||||
[CanBeNull]
|
||||
private readonly EncryptionDictionary encryptionDictionary;
|
||||
|
||||
@ -46,6 +38,7 @@
|
||||
|
||||
private readonly ILookupFilterProvider filterProvider;
|
||||
private readonly BookmarksProvider bookmarksProvider;
|
||||
private readonly InternalParsingOptions parsingOptions;
|
||||
|
||||
[NotNull]
|
||||
private readonly Pages pages;
|
||||
@ -82,11 +75,10 @@
|
||||
/// </summary>
|
||||
public bool IsEncrypted => encryptionDictionary != null;
|
||||
|
||||
internal PdfDocument(ILog log,
|
||||
internal PdfDocument(
|
||||
IInputBytes inputBytes,
|
||||
HeaderVersion version,
|
||||
CrossReferenceTable crossReferenceTable,
|
||||
ParsingCachingProviders cachingProviders,
|
||||
IPageFactory pageFactory,
|
||||
Catalog catalog,
|
||||
DocumentInformation information,
|
||||
@ -95,17 +87,16 @@
|
||||
ILookupFilterProvider filterProvider,
|
||||
AcroFormFactory acroFormFactory,
|
||||
BookmarksProvider bookmarksProvider,
|
||||
bool clipPaths)
|
||||
InternalParsingOptions parsingOptions)
|
||||
{
|
||||
this.log = log;
|
||||
this.inputBytes = inputBytes;
|
||||
this.version = version ?? throw new ArgumentNullException(nameof(version));
|
||||
this.cachingProviders = cachingProviders ?? throw new ArgumentNullException(nameof(cachingProviders));
|
||||
this.encryptionDictionary = encryptionDictionary;
|
||||
this.pdfScanner = pdfScanner ?? throw new ArgumentNullException(nameof(pdfScanner));
|
||||
this.filterProvider = filterProvider ?? throw new ArgumentNullException(nameof(filterProvider));
|
||||
this.bookmarksProvider = bookmarksProvider ?? throw new ArgumentNullException(nameof(bookmarksProvider));
|
||||
this.clipPaths = clipPaths;
|
||||
this.parsingOptions = parsingOptions;
|
||||
|
||||
Information = information ?? throw new ArgumentNullException(nameof(information));
|
||||
pages = new Pages(catalog, pageFactory, pdfScanner);
|
||||
Structure = new Structure(catalog, crossReferenceTable, pdfScanner);
|
||||
@ -153,11 +144,11 @@
|
||||
throw new ObjectDisposedException("Cannot access page after the document is disposed.");
|
||||
}
|
||||
|
||||
log.Debug($"Accessing page {pageNumber}.");
|
||||
parsingOptions.Logger.Debug($"Accessing page {pageNumber}.");
|
||||
|
||||
try
|
||||
{
|
||||
return pages.GetPage(pageNumber, clipPaths);
|
||||
return pages.GetPage(pageNumber, parsingOptions);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
@ -258,7 +249,7 @@
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
log.Error("Failed disposing the PdfDocument due to an error.", ex);
|
||||
parsingOptions.Logger.Error("Failed disposing the PdfDocument due to an error.", ex);
|
||||
}
|
||||
finally
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user