From b5b15ee593c541e33f88c13c97dc22f419dd09e7 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Thu, 14 Apr 2022 20:14:09 -0400 Subject: [PATCH 1/2] add handling for combining diacritics --- .../Integration/FarmerMacTests.cs | 1 - .../Integration/Math119FakingDataTests.cs | 18 +++++ .../Graphics/ContentStreamProcessor.cs | 62 ++++++++++++--- src/UglyToad.PdfPig/Util/Diacritics.cs | 75 +++++++++++++++++++ 4 files changed, 146 insertions(+), 10 deletions(-) create mode 100644 src/UglyToad.PdfPig.Tests/Integration/Math119FakingDataTests.cs create mode 100644 src/UglyToad.PdfPig/Util/Diacritics.cs diff --git a/src/UglyToad.PdfPig.Tests/Integration/FarmerMacTests.cs b/src/UglyToad.PdfPig.Tests/Integration/FarmerMacTests.cs index 67c3247a..01fc3a18 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/FarmerMacTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/FarmerMacTests.cs @@ -42,6 +42,5 @@ Assert.Contains("financial results for the fiscal quarter ended June 30, 2017 and (2) a conference call to discuss those results and Farmer Mac", page.Text); } } - } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig.Tests/Integration/Math119FakingDataTests.cs b/src/UglyToad.PdfPig.Tests/Integration/Math119FakingDataTests.cs new file mode 100644 index 00000000..9055c9bd --- /dev/null +++ b/src/UglyToad.PdfPig.Tests/Integration/Math119FakingDataTests.cs @@ -0,0 +1,18 @@ +namespace UglyToad.PdfPig.Tests.Integration; + +using Xunit; + +/* Integration test class: its single [Fact] opens Math119FakingData.pdf (path resolved through IntegrationHelpers.GetDocumentPath), fetches page 8 and extracts that page's words. */public class Math119FakingDataTests +{ + [Fact] + public void CombinesDiaeresisForWords() + { + using var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("Math119FakingData.pdf")); + + var lastPage = document.GetPage(8); + + var words = lastPage.GetWords(); /* NOTE(review): `words` is never inspected and the method contains no Assert call, so this test can only fail if opening the document, GetPage(8) or GetWords() throws. TODO: assert on the word(s) expected to contain a combined diaeresis - the expected text is not visible from here, confirm against the PDF. */+ + + } +} \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs index 95ee0495..33854211 100644 --- a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs +++ 
b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs @@ -17,6 +17,7 @@ using Tokenization.Scanner; using Tokens; using Operations.TextPositioning; + using Util; using XObjects; using static PdfPig.Core.PdfSubpath; @@ -293,15 +294,58 @@ ? currentState.CurrentNonStrokingColor : currentState.CurrentStrokingColor; - var letter = new Letter(unicode, transformedGlyphBounds, - transformedPdfBounds.BottomLeft, - transformedPdfBounds.BottomRight, - transformedPdfBounds.Width, - fontSize, - font.Details, - color, - pointSize, - textSequence); + Letter letter = null; + if (Diacritics.IsInCombiningDiacriticRange(unicode) && letters.Count > 0) + { + var attachTo = letters[letters.Count - 1]; + + if (attachTo.TextSequence == textSequence + && Diacritics.TryCombineDiacriticWithPreviousLetter(unicode, attachTo.Value, out var newLetter)) + { + // TODO: union of bounding boxes. + letters.Remove(attachTo); + + letter = new Letter( + newLetter, + attachTo.GlyphRectangle, + attachTo.StartBaseLine, + attachTo.EndBaseLine, + attachTo.Width, + attachTo.FontSize, + attachTo.Font, + attachTo.Color, + attachTo.PointSize, + attachTo.TextSequence); + } + else + { + letter = new Letter( + unicode, + transformedGlyphBounds, + transformedPdfBounds.BottomLeft, + transformedPdfBounds.BottomRight, + transformedPdfBounds.Width, + fontSize, + font.Details, + color, + pointSize, + textSequence); + } + } + else + { + letter = new Letter( + unicode, + transformedGlyphBounds, + transformedPdfBounds.BottomLeft, + transformedPdfBounds.BottomRight, + transformedPdfBounds.Width, + fontSize, + font.Details, + color, + pointSize, + textSequence); + } letters.Add(letter); diff --git a/src/UglyToad.PdfPig/Util/Diacritics.cs b/src/UglyToad.PdfPig/Util/Diacritics.cs new file mode 100644 index 00000000..f171ea04 --- /dev/null +++ b/src/UglyToad.PdfPig/Util/Diacritics.cs @@ -0,0 +1,75 @@ +namespace UglyToad.PdfPig.Util +{ + using System; + using System.Collections.Generic; + using System.Globalization; + + 
/// <summary>
/// Helpers for recognising diacritic marks and for merging a combining mark
/// into the letter that precedes it.
/// </summary>
internal static class Diacritics
{
    /// <summary>
    /// First code point (U+0300, decimal 768) of the Unicode "Combining Diacritical Marks" block.
    /// </summary>
    private const int CombiningRangeStart = 0x0300;

    /// <summary>
    /// Last code point (U+036F, decimal 879) of the Unicode "Combining Diacritical Marks" block.
    /// </summary>
    private const int CombiningRangeEnd = 0x036F;

    /// <summary>
    /// Single-character strings reported by <see cref="IsPotentialStandaloneDiacritic"/>.
    /// </summary>
    private static readonly HashSet<string> NonCombiningDiacritics = new HashSet<string>
    {
        "´",
        "^",
        "ˆ",
        "¨",
        "©",
        "™",
        "®",
        "`",
        "˜",
        "∼",
        "¸"
    };

    /// <summary>
    /// Whether <paramref name="value"/> is one of the known standalone (non-combining) marks.
    /// </summary>
    /// <param name="value">The text of a single letter.</param>
    /// <returns><see langword="true"/> when the value is in the standalone set.</returns>
    public static bool IsPotentialStandaloneDiacritic(string value) => NonCombiningDiacritics.Contains(value);

    /// <summary>
    /// Whether <paramref name="value"/> is exactly one UTF-16 code unit that lies in the
    /// combining diacritical marks range U+0300 to U+036F (inclusive).
    /// </summary>
    /// <param name="value">The text of a single letter; may be <see langword="null"/>.</param>
    /// <returns>
    /// <see langword="true"/> for a one-character string inside the range; <see langword="false"/>
    /// for <see langword="null"/>, for strings of any other length and for characters outside the range.
    /// </returns>
    public static bool IsInCombiningDiacriticRange(string value)
    {
        // A null value previously caused a NullReferenceException on value.Length.
        if (value == null || value.Length != 1)
        {
            return false;
        }

        var codePoint = (int)value[0];

        return codePoint >= CombiningRangeStart && codePoint <= CombiningRangeEnd;
    }

    /// <summary>
    /// Tries to attach <paramref name="diacritic"/> to the end of <paramref name="previous"/>.
    /// </summary>
    /// <remarks>
    /// The candidate is the plain concatenation <c>previous + diacritic</c>; no Unicode normalization
    /// is applied. The combination is accepted only when the concatenation has the same number of
    /// text elements as <paramref name="previous"/> alone, i.e. the mark merged into the last
    /// text element instead of forming a new one.
    /// </remarks>
    /// <param name="diacritic">The mark to append.</param>
    /// <param name="previous">The text of the preceding letter.</param>
    /// <param name="result">
    /// The combined text on success; <see langword="null"/> whenever the method returns <see langword="false"/>.
    /// </param>
    /// <returns><see langword="true"/> when the mark combined with the previous letter.</returns>
    public static bool TryCombineDiacriticWithPreviousLetter(string diacritic, string previous, out string result)
    {
        result = null;

        // With nothing to append the text element count trivially stays equal, which used to be
        // reported as a successful combination; treat a missing mark as a failure instead.
        if (previous == null || string.IsNullOrEmpty(diacritic))
        {
            return false;
        }

        var combined = previous + diacritic;

        if (MeasureDiacriticAwareLength(previous) != MeasureDiacriticAwareLength(combined))
        {
            return false;
        }

        // Only publish the combined text on success so callers never observe a rejected candidate.
        result = combined;
        return true;
    }

    /// <summary>
    /// Counts the text elements in <paramref name="input"/> as defined by <see cref="StringInfo"/>,
    /// so a base character followed by combining marks counts once.
    /// </summary>
    private static int MeasureDiacriticAwareLength(string input) => new StringInfo(input).LengthInTextElements;
}
(Diacritics.IsInCombiningDiacriticRange(unicode) && letters.Count > 0) + if (Diacritics.IsInCombiningDiacriticRange(unicode) && bytes.CurrentOffset > 0 && letters.Count > 0) { var attachTo = letters[letters.Count - 1];