Support reading files with missing white space after xref in lenient mode (#906)

Support missing white space after xref

---------

Co-authored-by: Arnaud TAMAILLON <arnaud.tamaillon@younited-credit.fr>
This commit is contained in:
Arnaud TAMAILLON 2024-09-09 08:09:04 +02:00 committed by GitHub
parent 09bddba778
commit 4845f43696
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 65 additions and 7 deletions

View File

@ -0,0 +1,13 @@
namespace UglyToad.PdfPig.Tests.Integration
{
    /// <summary>
    /// Integration tests that open whole test documents to exercise cross reference handling.
    /// </summary>
    public class CrossReferenceParserTests
    {
        /// <summary>
        /// Opens the "xref-with-no-whitespace.pdf" test document and checks that it reports three pages.
        /// </summary>
        [Fact]
        public void CanReadDocumentWithMissingWhitespaceAfterXRef()
        {
            string documentPath = IntegrationHelpers.GetSpecificTestDocumentPath("xref-with-no-whitespace.pdf");

            using (var document = PdfDocument.Open(documentPath))
            {
                Assert.Equal(3, document.NumberOfPages);
            }
        }
    }
}

View File

@ -300,18 +300,42 @@ trailer
trailer
<< >>";
// Strict parsing
var input = StringBytesTestConverter.Scanner(data);
var act = () => CrossReferenceTableParser.Parse(input.scanner, 0, false);
var input = GetReader(data);
var act = () => CrossReferenceTableParser.Parse(input, 0, false);
var ex = Assert.Throws<PdfDocumentFormatException>(act);
Assert.Equal("Found a line with 2 unexpected entries in the cross reference table: 127, 0.", ex.Message);
// Lenient Parsing
input = StringBytesTestConverter.Scanner(data);
var result = CrossReferenceTableParser.Parse(input.scanner, 0, true);
input = GetReader(data);
var result = CrossReferenceTableParser.Parse(input, 0, true);
Assert.Equal(6, result.ObjectOffsets.Count);
}
[Fact]
public void ParsesMissingWhitespaceAfterXref()
{
    // The "xref" keyword is immediately followed by the subsection header ("15 2"),
    // with no whitespace between them.
    var data = @"xref15 2
0000000190 00000 n
0000000250 00032 n
trailer
<<>>";

    // Strict parsing: the fused "xref15" operator must be rejected.
    var strictScanner = GetReader(data);
    var strictFailure = Assert.Throws<PdfDocumentFormatException>(
        () => CrossReferenceTableParser.Parse(strictScanner, 0, false));
    Assert.Equal("Unexpected operator in xref position: xref15.", strictFailure.Message);

    // Lenient parsing: the same input is expected to yield both object offsets.
    var lenientScanner = GetReader(data);
    var table = CrossReferenceTableParser.Parse(lenientScanner, 0, true);
    Assert.Equal(2, table.ObjectOffsets.Count);
}
private static CoreTokenScanner GetReader(string input)
{
return StringBytesTestConverter.Scanner(input).scanner;

View File

@ -59,7 +59,7 @@
tokenScanner.MoveNext();
if (tokenScanner.CurrentToken is OperatorToken tableToken && tableToken.Data == "xref")
if (CrossReferenceTableParser.IsCrossReferenceMarker(tokenScanner, isLenientParsing))
{
missedAttempts = 0;
log.Debug("Element was cross reference table.");

View File

@ -13,7 +13,7 @@
{
private const string InUseEntry = "n";
private const string FreeEntry = "f";
public static CrossReferenceTablePart Parse(ISeekableTokenScanner scanner, long offset, bool isLenientParsing)
{
var builder = new CrossReferenceTablePartBuilder
@ -31,10 +31,22 @@
if (scanner.CurrentToken is OperatorToken operatorToken)
{
if (operatorToken.Data == "xref")
if (operatorToken.Data == OperatorToken.Xref.Data)
{
scanner.MoveNext();
}
else if (isLenientParsing)
{
if (operatorToken.Data.StartsWith(OperatorToken.Xref.Data))
{
scanner.Seek(scanner.CurrentPosition - operatorToken.Data.Length + OperatorToken.Xref.Data.Length);
scanner.MoveNext();
}
else
{
throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}.");
}
}
else
{
throw new PdfDocumentFormatException($"Unexpected operator in xref position: {operatorToken}.");
@ -106,6 +118,15 @@
return builder.Build();
}
/// <summary>
/// Determines whether the scanner's current token marks the start of a cross reference table.
/// </summary>
/// <param name="scanner">The scanner whose <c>CurrentToken</c> is inspected. The scanner is not advanced.</param>
/// <param name="isLenientParsing">
/// When <see langword="true"/>, an operator that starts with the xref keyword and is followed
/// directly by an integer (for example <c>xref15</c>, where the whitespace after the keyword
/// is missing) is also accepted.
/// </param>
/// <returns>
/// <see langword="true"/> if the current token is the xref operator, or (in lenient mode only)
/// the xref keyword fused with a following integer; otherwise <see langword="false"/>.
/// </returns>
public static bool IsCrossReferenceMarker(ISeekableTokenScanner scanner, bool isLenientParsing)
{
    if (!(scanner.CurrentToken is OperatorToken operatorToken))
    {
        return false;
    }

    var marker = OperatorToken.Xref.Data;
    var data = operatorToken.Data;

    if (data == marker)
    {
        return true;
    }

    // File syntax is not linguistic text: compare ordinally so the result does not depend on
    // the current culture (culture-sensitive prefix checks can ignore some characters).
    if (!isLenientParsing || !data.StartsWith(marker, System.StringComparison.Ordinal))
    {
        return false;
    }

    // Whatever follows the keyword must be an integer; parse with the invariant culture so the
    // outcome is the same on every machine.
    return int.TryParse(
        data.Substring(marker.Length),
        System.Globalization.NumberStyles.Integer,
        System.Globalization.CultureInfo.InvariantCulture,
        out _);
}
private static int ProcessTokens(ReadOnlySpan<IToken> tokens, CrossReferenceTablePartBuilder builder, bool isLenientParsing,
int objectCount, ref TableSubsectionDefinition definition)
{