add handling for combining diacritics

This commit is contained in:
Eliot Jones 2022-04-14 20:14:09 -04:00
parent 4bcbb0655c
commit b5b15ee593
4 changed files with 146 additions and 10 deletions

View File

@ -42,6 +42,5 @@
Assert.Contains("financial results for the fiscal quarter ended June 30, 2017 and (2) a conference call to discuss those results and Farmer Mac", page.Text);
}
}
}
}

View File

@ -0,0 +1,18 @@
namespace UglyToad.PdfPig.Tests.Integration;
using Xunit;
/// <summary>
/// Integration tests that run against the "Math119FakingData.pdf" sample document.
/// </summary>
public class Math119FakingDataTests
{
    /// <summary>
    /// Opens the sample document and requests the words of page 8.
    /// </summary>
    /// <remarks>
    /// NOTE(review): nothing is asserted about the returned words, so this test passes
    /// as long as the calls below complete without throwing. Consider asserting on the
    /// expected combined text once it has been confirmed against the document.
    /// </remarks>
    [Fact]
    public void CombinesDiaeresisForWords()
    {
        var path = IntegrationHelpers.GetDocumentPath("Math119FakingData.pdf");

        using (var document = PdfDocument.Open(path))
        {
            // The result is intentionally left unused, exactly as before.
            var words = document.GetPage(8).GetWords();
        }
    }
}

View File

@ -17,6 +17,7 @@
using Tokenization.Scanner;
using Tokens;
using Operations.TextPositioning;
using Util;
using XObjects;
using static PdfPig.Core.PdfSubpath;
@ -293,15 +294,58 @@
? currentState.CurrentNonStrokingColor
: currentState.CurrentStrokingColor;
var letter = new Letter(unicode, transformedGlyphBounds,
transformedPdfBounds.BottomLeft,
transformedPdfBounds.BottomRight,
transformedPdfBounds.Width,
fontSize,
font.Details,
color,
pointSize,
textSequence);
Letter letter = null;
if (Diacritics.IsInCombiningDiacriticRange(unicode) && letters.Count > 0)
{
var attachTo = letters[letters.Count - 1];
if (attachTo.TextSequence == textSequence
&& Diacritics.TryCombineDiacriticWithPreviousLetter(unicode, attachTo.Value, out var newLetter))
{
// TODO: union of bounding boxes.
letters.Remove(attachTo);
letter = new Letter(
newLetter,
attachTo.GlyphRectangle,
attachTo.StartBaseLine,
attachTo.EndBaseLine,
attachTo.Width,
attachTo.FontSize,
attachTo.Font,
attachTo.Color,
attachTo.PointSize,
attachTo.TextSequence);
}
else
{
letter = new Letter(
unicode,
transformedGlyphBounds,
transformedPdfBounds.BottomLeft,
transformedPdfBounds.BottomRight,
transformedPdfBounds.Width,
fontSize,
font.Details,
color,
pointSize,
textSequence);
}
}
else
{
letter = new Letter(
unicode,
transformedGlyphBounds,
transformedPdfBounds.BottomLeft,
transformedPdfBounds.BottomRight,
transformedPdfBounds.Width,
fontSize,
font.Details,
color,
pointSize,
textSequence);
}
letters.Add(letter);

View File

@ -0,0 +1,75 @@
namespace UglyToad.PdfPig.Util
{
using System;
using System.Collections.Generic;
using System.Globalization;
/// <summary>
/// Helpers for recognising diacritic marks and for attaching a combining mark
/// to the letter that precedes it.
/// </summary>
internal static class Diacritics
{
    /// <summary>
    /// First code unit treated as a combining diacritic (U+0300, decimal 768).
    /// </summary>
    private const int CombiningRangeStart = 0x0300;

    /// <summary>
    /// Last code unit treated as a combining diacritic (U+036F, decimal 879).
    /// </summary>
    private const int CombiningRangeEnd = 0x036F;

    /// <summary>
    /// Values reported by <see cref="IsPotentialStandaloneDiacritic"/> as possible
    /// standalone (non-combining) diacritics.
    /// </summary>
    private static readonly HashSet<string> NonCombiningDiacritics = new HashSet<string>
    {
        "´",
        "^",
        "ˆ",
        "¨",
        "©",
        "™",
        "®",
        "`",
        "˜",
        // NOTE(review): this entry reads as an empty string here; it may originally have
        // held a character that did not survive extraction. Confirm against the repository.
        "",
        "¸"
    };

    /// <summary>
    /// Whether <paramref name="value"/> is one of the known standalone diacritic strings.
    /// </summary>
    /// <param name="value">The text to look up.</param>
    /// <returns><c>true</c> if the value is in the standalone diacritic set.</returns>
    public static bool IsPotentialStandaloneDiacritic(string value) => NonCombiningDiacritics.Contains(value);

    /// <summary>
    /// Whether <paramref name="value"/> is exactly one UTF-16 code unit lying in the
    /// range U+0300 to U+036F inclusive.
    /// </summary>
    /// <param name="value">The text to test; may be <c>null</c>.</param>
    /// <returns>
    /// <c>true</c> for a single code unit inside the range; <c>false</c> for <c>null</c>,
    /// for any string whose length is not 1 and for code units outside the range.
    /// </returns>
    public static bool IsInCombiningDiacriticRange(string value)
    {
        // Null is treated as "not a diacritic" instead of raising a NullReferenceException.
        if (value == null || value.Length != 1)
        {
            return false;
        }

        int codeUnit = value[0];

        return codeUnit >= CombiningRangeStart && codeUnit <= CombiningRangeEnd;
    }

    /// <summary>
    /// Tries to append <paramref name="diacritic"/> to <paramref name="previous"/> so that
    /// the two form a single text element.
    /// </summary>
    /// <param name="diacritic">The mark to attach.</param>
    /// <param name="previous">The text of the preceding letter.</param>
    /// <param name="result">
    /// On success, <paramref name="previous"/> followed by <paramref name="diacritic"/>;
    /// otherwise <c>null</c>.
    /// </param>
    /// <returns>
    /// <c>true</c> when appending the mark leaves the number of text elements unchanged,
    /// i.e. the mark merged into the preceding text; <c>false</c> otherwise, including
    /// when either argument is <c>null</c> or the diacritic is empty.
    /// </returns>
    public static bool TryCombineDiacriticWithPreviousLetter(string diacritic, string previous, out string result)
    {
        result = null;

        // Without a mark to attach, or a letter to attach it to, there is nothing to combine.
        if (previous == null || string.IsNullOrEmpty(diacritic))
        {
            return false;
        }

        var combined = previous + diacritic;

        // On combining the length in text elements should remain equal.
        if (MeasureDiacriticAwareLength(previous) != MeasureDiacriticAwareLength(combined))
        {
            return false;
        }

        // Only publish the combined text on success so a failed attempt never leaks it.
        result = combined;
        return true;
    }

    /// <summary>
    /// Counts the text elements in <paramref name="input"/> as enumerated by
    /// <see cref="StringInfo.GetTextElementEnumerator(string)"/>.
    /// </summary>
    /// <param name="input">The text to measure.</param>
    /// <returns>The number of text elements.</returns>
    private static int MeasureDiacriticAwareLength(string input)
    {
        var length = 0;

        var enumerator = StringInfo.GetTextElementEnumerator(input);

        while (enumerator.MoveNext())
        {
            length++;
        }

        return length;
    }
}
}