Merge pull request #527 from fnatzke/Issue350-PdfPageBuilder.CopyFrom()_creates_invalid_PDF

Issue 350: PdfPageBuilder.CopyFrom() creates invalid PDF
This commit is contained in:
Eliot Jones 2023-03-17 18:15:06 +01:00 committed by GitHub
commit 68c00c9b51
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 73 additions and 1 deletions

View File

@ -0,0 +1,48 @@
namespace UglyToad.PdfPig.Tests.Integration
{
using System.Linq;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Writer;
using Xunit;
/// <summary>
/// Integration test for issue 350: copies a page whose ShowText operations contain
/// unbalanced round brackets and checks that the copied document can be read back.
/// </summary>
public class ShowTextEscapeText
{
    /// <summary>
    /// Returns the path of the source PDF used by this test.
    /// </summary>
    private static string GetFilename()
    {
        // The single page of the source PDF has 3 ShowText operations with unbalanced round brackets in the text.
        // Unbalanced means there is an open bracket without a close bracket, or a close bracket without an open one.
        //   1. line 387: (\() Tj
        //   2. line 397: (\)) Tj
        //   3. line 384: ( \(I\222ll try to stay on ) Tj
        // In text 3 the \222 (octal, 0x92 hex) is similar to an apostrophe ', so the text reads like
        // " (I'll try to stay on" (with an open bracket).
        return IntegrationHelpers.GetDocumentPath("ShowTextOpWithUnbalancedRoundBrackets.pdf");
    }

    /// <summary>
    /// Copies the first page of the source PDF into a new document, reopens the result from memory
    /// and verifies that text following the unbalanced-bracket ShowText operations is still extracted.
    /// </summary>
    [Fact]
    public void PdfCopyShowTextOpUsesEscapedText()
    {
        var filePath = GetFilename();

        using (var sourceDocument = PdfDocument.Open(filePath))
        {
            var pdfBuilder = new PdfDocumentBuilder();

            // Only the first page is copied.
            const int pageNumber = 1;
            var sourcePage = sourceDocument.GetPage(pageNumber);
            pdfBuilder.AddPage(sourcePage.Width, sourcePage.Height).CopyFrom(sourcePage);

            var pdfBytes = pdfBuilder.Build();

            // Reread (in memory) the copied PDF and check that the example text ("wander") exists in the
            // word extract after the ShowText operations with unbalanced brackets.
            using (var document = PdfDocument.Open(pdfBytes))
            {
                var page = document.GetPage(1);
                var words = page.GetWords();

                var isExpectedTextInCopiedPdf = words.Any(w => w.Text.Contains("wander"));

                Assert.True(isExpectedTextInCopiedPdf, "Expected the word \"wander\" in the copied PDF.");
            }
        }
    }
}
}

View File

@ -67,16 +67,40 @@
operationContext.ShowText(input);
}
/// <summary>
/// Escapes the characters that are special inside a PDF literal string written in parentheses:
/// backslash, left parenthesis and right parenthesis are each prefixed with a backslash.
/// </summary>
/// <param name="text">The raw text to escape; may be <c>null</c>.</param>
/// <returns>The escaped text, or <c>null</c> when <paramref name="text"/> is <c>null</c>.</returns>
private static string EscapeText(string text)
{
    if (text is null)
    {
        return null;
    }

    // Fix for issue 350, from PDF Spec 1.7 (page 408) on handling the 'special characters' '(', ')' and '\'.
    // The strings must conform to the syntax for string objects.
    // When a string is written by enclosing the data in parentheses, bytes whose values are the same as those
    // of the ASCII characters left parenthesis (40), right parenthesis (41), and backslash (92)
    // must be preceded by a backslash character.
    // All other byte values between 0 and 255 may be used in a string object.
    // These rules apply to each individual byte in a string object, whether the string is interpreted by the
    // text-showing operators as single-byte or multiple-byte character codes.
    //
    // Order matters: the backslash must be escaped first, otherwise the backslashes added
    // for the brackets would themselves be escaped again.
    return text
        .Replace(@"\", @"\\")   // '\' -> '\\'
        .Replace("(", @"\(")    // '(' -> '\('
        .Replace(")", @"\)");   // ')' -> '\)'
}
/// <inheritdoc />
public void Write(Stream stream)
{
if (Bytes != null)
{
stream.WriteHex(Bytes);
}
else
{
stream.WriteText($"({Text})");
var EscapedText = EscapeText(Text); // escape '(', ')' or '\'
stream.WriteText($"({EscapedText})");
}
stream.WriteWhiteSpace();