diff --git a/src/UglyToad.PdfPig.Tests/Integration/FarmerMacTests.cs b/src/UglyToad.PdfPig.Tests/Integration/FarmerMacTests.cs
index 67c3247a..01fc3a18 100644
--- a/src/UglyToad.PdfPig.Tests/Integration/FarmerMacTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/FarmerMacTests.cs
@@ -42,6 +42,5 @@
                 Assert.Contains("financial results for the fiscal quarter ended June 30, 2017 and (2) a conference call to discuss those results and Farmer Mac", page.Text);
             }
         }
-
     }
 }
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig.Tests/Integration/Math119FakingDataTests.cs b/src/UglyToad.PdfPig.Tests/Integration/Math119FakingDataTests.cs
new file mode 100644
index 00000000..9055c9bd
--- /dev/null
+++ b/src/UglyToad.PdfPig.Tests/Integration/Math119FakingDataTests.cs
@@ -0,0 +1,20 @@
+namespace UglyToad.PdfPig.Tests.Integration;
+
+using Xunit;
+
+public class Math119FakingDataTests
+{
+    [Fact]
+    public void CombinesDiaeresisForWords()
+    {
+        using var document = PdfDocument.Open(IntegrationHelpers.GetDocumentPath("Math119FakingData.pdf"));
+
+        var lastPage = document.GetPage(8);
+
+        var words = lastPage.GetWords();
+
+        // TODO(review): pin the exact word(s) expected to carry the combined diaeresis once
+        // confirmed against the document; until then make sure the test cannot pass vacuously.
+        Assert.NotEmpty(words);
+    }
+}
\ No newline at end of file
diff --git a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
index 95ee0495..52760142 100644
--- a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
+++ b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
@@ -17,6 +17,7 @@
     using Tokenization.Scanner;
     using Tokens;
     using Operations.TextPositioning;
+    using Util;
     using XObjects;
     using static PdfPig.Core.PdfSubpath;
 
@@ -293,15 +294,52 @@
                     ? currentState.CurrentNonStrokingColor
                     : currentState.CurrentStrokingColor;
 
-                var letter = new Letter(unicode, transformedGlyphBounds,
-                    transformedPdfBounds.BottomLeft,
-                    transformedPdfBounds.BottomRight,
-                    transformedPdfBounds.Width,
-                    fontSize,
-                    font.Details,
-                    color,
-                    pointSize,
-                    textSequence);
+                Letter letter = null;
+
+                // A combining diacritic is merged into the letter added immediately before it
+                // (same text sequence only) instead of being emitted as a letter of its own.
+                if (Diacritics.IsInCombiningDiacriticRange(unicode) && bytes.CurrentOffset > 0 && letters.Count > 0)
+                {
+                    var previousIndex = letters.Count - 1;
+                    var attachTo = letters[previousIndex];
+
+                    if (attachTo.TextSequence == textSequence
+                        && Diacritics.TryCombineDiacriticWithPreviousLetter(unicode, attachTo.Value, out var newLetter))
+                    {
+                        // Remove by index: the target is known to be the last entry, so there is
+                        // no need to search the list for it by value.
+                        letters.RemoveAt(previousIndex);
+
+                        // TODO: union of bounding boxes.
+                        letter = new Letter(
+                            newLetter,
+                            attachTo.GlyphRectangle,
+                            attachTo.StartBaseLine,
+                            attachTo.EndBaseLine,
+                            attachTo.Width,
+                            attachTo.FontSize,
+                            attachTo.Font,
+                            attachTo.Color,
+                            attachTo.PointSize,
+                            attachTo.TextSequence);
+                    }
+                }
+
+                if (letter == null)
+                {
+                    // Not a combining diacritic, or it could not be merged: emit the glyph unchanged.
+                    letter = new Letter(
+                        unicode,
+                        transformedGlyphBounds,
+                        transformedPdfBounds.BottomLeft,
+                        transformedPdfBounds.BottomRight,
+                        transformedPdfBounds.Width,
+                        fontSize,
+                        font.Details,
+                        color,
+                        pointSize,
+                        textSequence);
+                }
 
                 letters.Add(letter);
 
diff --git a/src/UglyToad.PdfPig/Util/Diacritics.cs b/src/UglyToad.PdfPig/Util/Diacritics.cs
new file mode 100644
index 00000000..f171ea04
--- /dev/null
+++ b/src/UglyToad.PdfPig/Util/Diacritics.cs
@@ -0,0 +1,86 @@
+namespace UglyToad.PdfPig.Util
+{
+    using System;
+    using System.Collections.Generic;
+    using System.Globalization;
+
+    /// <summary>
+    /// Helpers for recognising diacritic marks and merging a combining mark into the preceding letter.
+    /// </summary>
+    internal static class Diacritics
+    {
+        // First and last code points of the Unicode "Combining Diacritical Marks" block (U+0300 to U+036F).
+        private const int CombiningRangeStart = 0x0300;
+        private const int CombiningRangeEnd = 0x036F;
+
+        private static readonly HashSet<string> NonCombiningDiacritics = new HashSet<string>
+        {
+            "´",
+            "^",
+            "ˆ",
+            "¨",
+            "©",
+            "™",
+            "®",
+            "`",
+            "˜",
+            "∼",
+            "¸"
+        };
+
+        /// <summary>
+        /// Whether <paramref name="value"/> is one of the standalone symbols listed in <see cref="NonCombiningDiacritics"/>.
+        /// </summary>
+        public static bool IsPotentialStandaloneDiacritic(string value) => NonCombiningDiacritics.Contains(value);
+
+        /// <summary>
+        /// Whether <paramref name="value"/> is exactly one UTF-16 character in the range U+0300 to U+036F.
+        /// Returns <see langword="false"/> for <see langword="null"/> rather than throwing.
+        /// </summary>
+        public static bool IsInCombiningDiacriticRange(string value)
+        {
+            if (value == null || value.Length != 1)
+            {
+                return false;
+            }
+
+            var codeUnit = (int)value[0];
+
+            return codeUnit >= CombiningRangeStart && codeUnit <= CombiningRangeEnd;
+        }
+
+        /// <summary>
+        /// Appends <paramref name="diacritic"/> to <paramref name="previous"/> and reports whether the
+        /// text element count is unchanged, i.e. the mark attached to the existing text.
+        /// </summary>
+        /// <param name="diacritic">The mark to append.</param>
+        /// <param name="previous">The text of the preceding letter.</param>
+        /// <param name="result">The combined text on success, otherwise <see langword="null"/>.</param>
+        /// <returns><see langword="true"/> if the mark combined with the preceding text.</returns>
+        public static bool TryCombineDiacriticWithPreviousLetter(string diacritic, string previous, out string result)
+        {
+            result = null;
+
+            if (previous == null || string.IsNullOrEmpty(diacritic))
+            {
+                return false;
+            }
+
+            var combined = previous + diacritic;
+
+            // On combining the length should remain equal.
+            if (MeasureDiacriticAwareLength(previous) != MeasureDiacriticAwareLength(combined))
+            {
+                return false;
+            }
+
+            result = combined;
+            return true;
+        }
+
+        /// <summary>
+        /// Counts the text elements (as defined by <see cref="StringInfo"/>) in <paramref name="input"/>.
+        /// </summary>
+        private static int MeasureDiacriticAwareLength(string input) => new StringInfo(input).LengthInTextElements;
+    }
+}