Introduce ParsingOptions.FilterProvider and BaseFilterProvider and make CcittFaxCompressionType a byte

This commit is contained in:
BobLd 2024-10-15 22:51:22 +01:00
parent 4b5cb4736f
commit 8cee4f480f
8 changed files with 242 additions and 81 deletions

View File

@ -0,0 +1,126 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using PdfPig.Filters;
using PdfPig.Tokens;
using System;
using System.Collections.Generic;
using System.Linq;
public class FilterTests
{
// Absolute path of the folder holding the integration test documents.
// Resolved on first access, relative to the application base directory.
private static readonly Lazy<string> DocumentFolder = new(
    () =>
    {
        var relative = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents");
        return Path.GetFullPath(relative);
    });

// Documents that GetAllDocuments leaves out of the theory data.
private static readonly HashSet<string> _documentsToIgnore =
[
    "issue_671.pdf",
    "GHOSTSCRIPT-698363-0.pdf",
    "ErcotFacts.pdf"
];
/// <summary>
/// Opens each test document with <see cref="MyFilterProvider"/> and checks that every image whose
/// single named filter is neither Flate nor LZW cannot be converted to PNG.
/// </summary>
/// <param name="documentName">Short file name of the document, as produced by <see cref="GetAllDocuments"/>.</param>
[Theory]
[MemberData(nameof(GetAllDocuments))]
public void NoImageDecoding(string documentName)
{
    // The theory data only carries the short file name (so it shows up nicely in the test explorer);
    // rebuild the full path before opening.
    var fullPath = Path.Combine(DocumentFolder.Value, documentName);

    var options = new ParsingOptions
    {
        UseLenientParsing = true,
        FilterProvider = MyFilterProvider.Instance
    };

    using var document = PdfDocument.Open(fullPath, options);

    // Page numbers passed to GetPage are 1-based.
    for (var pageNumber = 1; pageNumber <= document.NumberOfPages; pageNumber++)
    {
        foreach (var image in document.GetPage(pageNumber).GetImages())
        {
            // Only images whose filter entry is a single name are checked.
            if (!image.ImageDictionary.TryGet(NameToken.Filter, out NameToken filter))
            {
                continue;
            }

            var filterName = filter.Data;
            var hasWorkingFilter =
                filterName.Equals(NameToken.FlateDecode.Data) ||
                filterName.Equals(NameToken.FlateDecodeAbbreviation.Data) ||
                filterName.Equals(NameToken.LzwDecode.Data) ||
                filterName.Equals(NameToken.LzwDecodeAbbreviation.Data);

            // Flate and LZW are mapped to real filters by the test provider, so those images are skipped.
            if (hasWorkingFilter)
            {
                continue;
            }

            Assert.False(image.TryGetPng(out _));
        }
    }
}
/// <summary>
/// Placeholder <see cref="IFilter"/> that reports itself as unsupported and never decodes anything.
/// </summary>
public sealed class NoFilter : IFilter
{
    /// <summary>
    /// Always <c>false</c>.
    /// </summary>
    public bool IsSupported
    {
        get { return false; }
    }

    /// <summary>
    /// Not implemented; always throws.
    /// </summary>
    /// <exception cref="NotImplementedException">Thrown on every call.</exception>
    public ReadOnlyMemory<byte> Decode(ReadOnlySpan<byte> input, DictionaryToken streamDictionary, int filterIndex)
        => throw new NotImplementedException();
}
/// <summary>
/// Filter provider used by the test. ASCII85, ASCIIHex, Flate, RunLength and LZW names map to the
/// real filters; CCITT fax, DCT, JBIG2 and JPX names map to <see cref="NoFilter"/>.
/// </summary>
public class MyFilterProvider : BaseFilterProvider
{
    /// <summary>
    /// The single instance of this provider.
    /// </summary>
    public static readonly IFilterProvider Instance = new MyFilterProvider();

    /// <summary>
    /// Create the provider with the lookup built by <see cref="GetDictionary"/>.
    /// </summary>
    protected MyFilterProvider() : base(GetDictionary())
    {
    }

    /// <summary>
    /// Build the name-to-filter lookup. Full names and abbreviations share the same filter instance.
    /// </summary>
    private static Dictionary<string, IFilter> GetDictionary()
    {
        var ascii85 = new Ascii85Filter();
        var asciiHex = new AsciiHexDecodeFilter();
        var flate = new FlateFilter();
        var runLength = new RunLengthFilter();
        var lzw = new LzwFilter();

        var noFilter = new NoFilter();

        // Every key uses the token's string name (.Data) explicitly rather than relying on the
        // implicit NameToken-to-string conversion, so all entries read the same way.
        return new Dictionary<string, IFilter>
        {
            { NameToken.Ascii85Decode.Data, ascii85 },
            { NameToken.Ascii85DecodeAbbreviation.Data, ascii85 },
            { NameToken.AsciiHexDecode.Data, asciiHex },
            { NameToken.AsciiHexDecodeAbbreviation.Data, asciiHex },
            { NameToken.CcittfaxDecode.Data, noFilter },
            { NameToken.CcittfaxDecodeAbbreviation.Data, noFilter },
            { NameToken.DctDecode.Data, noFilter },
            { NameToken.DctDecodeAbbreviation.Data, noFilter },
            { NameToken.FlateDecode.Data, flate },
            { NameToken.FlateDecodeAbbreviation.Data, flate },
            { NameToken.Jbig2Decode.Data, noFilter },
            { NameToken.JpxDecode.Data, noFilter },
            { NameToken.RunLengthDecode.Data, runLength },
            { NameToken.RunLengthDecodeAbbreviation.Data, runLength },
            { NameToken.LzwDecode.Data, lzw },
            { NameToken.LzwDecodeAbbreviation.Data, lzw }
        };
    }
}
/// <summary>
/// Theory data: the short file name of every <c>*.pdf</c> in the document folder whose path does
/// not end with one of the ignored document names.
/// </summary>
public static IEnumerable<object[]> GetAllDocuments
{
    get
    {
        var files = Directory.GetFiles(DocumentFolder.Value, "*.pdf");

        // Return the short name so we can see it in the test explorer.
        // File names are compared ordinally: a culture-sensitive suffix match could include or
        // exclude documents differently depending on the machine's current culture.
        return files
            .Where(path => !_documentsToIgnore.Any(ignored => path.EndsWith(ignored, StringComparison.Ordinal)))
            .Select(path => new object[] { Path.GetFileName(path) });
    }
}
}
}

View File

@ -97,6 +97,7 @@
"UglyToad.PdfPig.CrossReference.CrossReferenceType",
"UglyToad.PdfPig.CrossReference.TrailerDictionary",
"UglyToad.PdfPig.Exceptions.PdfDocumentEncryptedException",
"UglyToad.PdfPig.Filters.BaseFilterProvider",
"UglyToad.PdfPig.Filters.DefaultFilterProvider",
"UglyToad.PdfPig.Filters.IFilter",
"UglyToad.PdfPig.Filters.IFilterProvider",

View File

@ -0,0 +1,96 @@
namespace UglyToad.PdfPig.Filters
{
using Core;
using System;
using System.Collections.Generic;
using System.Linq;
using Tokens;
using Util;
/// <summary>
/// Base class for <see cref="IFilterProvider"/> implementations that resolve filters by name
/// from a fixed lookup supplied at construction.
/// </summary>
public abstract class BaseFilterProvider : IFilterProvider
{
    /// <summary>
    /// Dictionary of filters, keyed by filter name.
    /// </summary>
    protected readonly IReadOnlyDictionary<string, IFilter> FilterInstances;

    /// <summary>
    /// Create a new <see cref="BaseFilterProvider"/> with the given filters.
    /// </summary>
    /// <param name="filterInstances">The filters this provider serves, keyed by the name used to look them up.</param>
    /// <exception cref="ArgumentNullException"><paramref name="filterInstances"/> is <c>null</c>.</exception>
    protected BaseFilterProvider(IReadOnlyDictionary<string, IFilter> filterInstances)
    {
        // Fail at construction rather than with a NullReferenceException on the first lookup.
        FilterInstances = filterInstances ?? throw new ArgumentNullException(nameof(filterInstances));
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is <c>null</c>.</exception>
    /// <exception cref="PdfDocumentFormatException">
    /// The filter entry is neither a name nor an array, or the array contains an element that is not a name.
    /// </exception>
    /// <exception cref="NotSupportedException">A filter name has no entry in <see cref="FilterInstances"/>.</exception>
    public IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary)
    {
        if (dictionary is null)
        {
            throw new ArgumentNullException(nameof(dictionary));
        }

        // The filter entry is looked up under the Filter name, then under F.
        var token = dictionary.GetObjectOrDefault(NameToken.Filter, NameToken.F);
        if (token is null)
        {
            return Array.Empty<IFilter>();
        }

        switch (token)
        {
            case ArrayToken filters:
                var result = new IFilter[filters.Data.Count];
                for (var i = 0; i < filters.Data.Count; i++)
                {
                    var filterToken = filters.Data[i];

                    // Report a malformed array element the same way as a malformed filter entry,
                    // instead of surfacing a bare InvalidCastException.
                    if (filterToken is not NameToken filterName)
                    {
                        throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name at index {i} of the filter array, instead got: {filterToken}.");
                    }

                    result[i] = GetFilterStrict(filterName.Data);
                }

                return result;
            case NameToken name:
                return new[] { GetFilterStrict(name.Data) };
            default:
                throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name or array, instead got: {token}.");
        }
    }

    /// <inheritdoc />
    /// <exception cref="ArgumentNullException"><paramref name="names"/> is <c>null</c>.</exception>
    /// <exception cref="NotSupportedException">A filter name has no entry in <see cref="FilterInstances"/>.</exception>
    public IReadOnlyList<IFilter> GetNamedFilters(IReadOnlyList<NameToken> names)
    {
        if (names is null)
        {
            throw new ArgumentNullException(nameof(names));
        }

        // One filter is produced per name, so the final size is known up front.
        var result = new List<IFilter>(names.Count);

        foreach (var name in names)
        {
            // The NameToken is converted implicitly to its string name for the lookup.
            result.Add(GetFilterStrict(name));
        }

        return result;
    }

    /// <summary>
    /// Look up the filter registered under <paramref name="name"/>, throwing if there is none.
    /// </summary>
    private IFilter GetFilterStrict(string name)
    {
        if (!FilterInstances.TryGetValue(name, out var filter))
        {
            throw new NotSupportedException($"The filter with the name {name} is not supported yet. Please raise an issue.");
        }

        return filter;
    }

    /// <inheritdoc />
    public IReadOnlyList<IFilter> GetAllFilters()
    {
        // Several names can map to the same filter instance, so duplicates are removed.
        return FilterInstances.Values.Distinct().ToList();
    }
}
}

View File

@ -3,7 +3,7 @@
/// <summary>
/// Specifies the compression type to use with <see cref="T:UglyToad.PdfPig.Filters.CcittFaxDecoderStream" />.
/// </summary>
internal enum CcittFaxCompressionType
internal enum CcittFaxCompressionType : byte
{
/// <summary>
/// Modified Huffman (MH) - Group 3 variation (T2)

View File

@ -1,25 +1,24 @@
namespace UglyToad.PdfPig.Filters
{
using System;
using System.Collections.Generic;
using System.Linq;
using Core;
using Tokens;
using Util;
/// <summary>
/// The default implementation of the <see cref="T:UglyToad.PdfPig.Filters.IFilterProvider" />.
/// </summary>
public class DefaultFilterProvider : IFilterProvider
public sealed class DefaultFilterProvider : BaseFilterProvider
{
private readonly IReadOnlyDictionary<string, IFilter> filterInstances;
/// <summary>
/// The single instance of this provider.
/// </summary>
public static readonly IFilterProvider Instance = new DefaultFilterProvider();
private DefaultFilterProvider()
/// <inheritdoc/>
private DefaultFilterProvider() : base(GetDictionary())
{
}
private static Dictionary<string, IFilter> GetDictionary()
{
var ascii85 = new Ascii85Filter();
var asciiHex = new AsciiHexDecodeFilter();
@ -31,7 +30,7 @@
var runLength = new RunLengthFilter();
var lzw = new LzwFilter();
filterInstances = new Dictionary<string, IFilter>
return new Dictionary<string, IFilter>
{
{ NameToken.Ascii85Decode.Data, ascii85 },
{ NameToken.Ascii85DecodeAbbreviation.Data, ascii85 },
@ -47,77 +46,9 @@
{ NameToken.JpxDecode.Data, jpx },
{ NameToken.RunLengthDecode.Data, runLength },
{ NameToken.RunLengthDecodeAbbreviation.Data, runLength },
{NameToken.LzwDecode, lzw },
{NameToken.LzwDecodeAbbreviation, lzw }
{ NameToken.LzwDecode.Data, lzw },
{ NameToken.LzwDecodeAbbreviation.Data, lzw }
};
}
/// <inheritdoc />
public IReadOnlyList<IFilter> GetFilters(DictionaryToken dictionary)
{
if (dictionary is null)
{
throw new ArgumentNullException(nameof(dictionary));
}
var token = dictionary.GetObjectOrDefault(NameToken.Filter, NameToken.F);
if (token is null)
{
return Array.Empty<IFilter>();
}
switch (token)
{
case ArrayToken filters:
var result = new IFilter[filters.Data.Count];
for (var i = 0; i < filters.Data.Count; i++)
{
var filterToken = filters.Data[i];
var filterName = ((NameToken) filterToken).Data;
result[i] = GetFilterStrict(filterName);
}
return result;
case NameToken name:
return new[] { GetFilterStrict(name.Data) };
default:
throw new PdfDocumentFormatException($"The filter for the stream was not a valid object. Expected name or array, instead got: {token}.");
}
}
/// <inheritdoc />
public IReadOnlyList<IFilter> GetNamedFilters(IReadOnlyList<NameToken> names)
{
if (names is null)
{
throw new ArgumentNullException(nameof(names));
}
var result = new List<IFilter>();
foreach (var name in names)
{
result.Add(GetFilterStrict(name));
}
return result;
}
private IFilter GetFilterStrict(string name)
{
if (!filterInstances.TryGetValue(name, out var factory))
{
throw new NotSupportedException($"The filter with the name {name} is not supported yet. Please raise an issue.");
}
return factory;
}
/// <inheritdoc />
public IReadOnlyList<IFilter> GetAllFilters()
{
return filterInstances.Values.Distinct().ToList();
}
}
}

View File

@ -85,6 +85,7 @@
}
}
/// <inheritdoc />
public byte[] Encode(Stream input, DictionaryToken streamDictionary, int index)
{
const int headerLength = 2;

View File

@ -106,7 +106,7 @@
ISeekableTokenScanner scanner,
ParsingOptions parsingOptions)
{
var filterProvider = new FilterProviderWithLookup(DefaultFilterProvider.Instance);
var filterProvider = new FilterProviderWithLookup(parsingOptions.FilterProvider ?? DefaultFilterProvider.Instance);
CrossReferenceTable? crossReferenceTable = null;

View File

@ -1,5 +1,6 @@
namespace UglyToad.PdfPig
{
using Filters;
using System.Collections.Generic;
using Logging;
@ -50,5 +51,10 @@
/// forms and images when missing.
/// </summary>
public bool SkipMissingFonts { get; set; } = false;
/// <summary>
/// Filter provider to use while parsing the document. The <see cref="DefaultFilterProvider"/> will be used if set to <c>null</c>.
/// </summary>
public IFilterProvider? FilterProvider { get; set; } = null;
}
}