mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-04-05 20:55:01 +08:00
Merge pull request #440 from UglyToad/diacritics-stuff
add handling for combining diacritics #439
This commit is contained in:
commit
49ce5c7eb7
@ -42,6 +42,5 @@
|
||||
Assert.Contains("financial results for the fiscal quarter ended June 30, 2017 and (2) a conference call to discuss those results and Farmer Mac", page.Text);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
namespace UglyToad.PdfPig.Tests.Integration;
|
||||
|
||||
using Xunit;
|
||||
|
||||
/// <summary>
/// Integration test that opens the "Math119FakingData.pdf" sample document and
/// runs word extraction on page 8.
/// </summary>
public class Math119FakingDataTests
{
    /// <summary>
    /// Opens the document, loads page 8 and requests its words.
    /// </summary>
    [Fact]
    public void CombinesDiaeresisForWords()
    {
        var documentPath = IntegrationHelpers.GetDocumentPath("Math119FakingData.pdf");

        using (var document = PdfDocument.Open(documentPath))
        {
            var page = document.GetPage(8);

            // NOTE(review): nothing is asserted on the returned words, so this test only
            // fails if opening the document or reading the page throws - consider asserting
            // on the expected combined text once it has been confirmed against the PDF.
            var words = page.GetWords();
        }
    }
}
|
@ -17,6 +17,7 @@
|
||||
using Tokenization.Scanner;
|
||||
using Tokens;
|
||||
using Operations.TextPositioning;
|
||||
using Util;
|
||||
using XObjects;
|
||||
using static PdfPig.Core.PdfSubpath;
|
||||
|
||||
@ -293,15 +294,58 @@
|
||||
? currentState.CurrentNonStrokingColor
|
||||
: currentState.CurrentStrokingColor;
|
||||
|
||||
var letter = new Letter(unicode, transformedGlyphBounds,
|
||||
transformedPdfBounds.BottomLeft,
|
||||
transformedPdfBounds.BottomRight,
|
||||
transformedPdfBounds.Width,
|
||||
fontSize,
|
||||
font.Details,
|
||||
color,
|
||||
pointSize,
|
||||
textSequence);
|
||||
Letter letter = null;
|
||||
if (Diacritics.IsInCombiningDiacriticRange(unicode) && bytes.CurrentOffset > 0 && letters.Count > 0)
|
||||
{
|
||||
var attachTo = letters[letters.Count - 1];
|
||||
|
||||
if (attachTo.TextSequence == textSequence
|
||||
&& Diacritics.TryCombineDiacriticWithPreviousLetter(unicode, attachTo.Value, out var newLetter))
|
||||
{
|
||||
// TODO: union of bounding boxes.
|
||||
letters.Remove(attachTo);
|
||||
|
||||
letter = new Letter(
|
||||
newLetter,
|
||||
attachTo.GlyphRectangle,
|
||||
attachTo.StartBaseLine,
|
||||
attachTo.EndBaseLine,
|
||||
attachTo.Width,
|
||||
attachTo.FontSize,
|
||||
attachTo.Font,
|
||||
attachTo.Color,
|
||||
attachTo.PointSize,
|
||||
attachTo.TextSequence);
|
||||
}
|
||||
else
|
||||
{
|
||||
letter = new Letter(
|
||||
unicode,
|
||||
transformedGlyphBounds,
|
||||
transformedPdfBounds.BottomLeft,
|
||||
transformedPdfBounds.BottomRight,
|
||||
transformedPdfBounds.Width,
|
||||
fontSize,
|
||||
font.Details,
|
||||
color,
|
||||
pointSize,
|
||||
textSequence);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
letter = new Letter(
|
||||
unicode,
|
||||
transformedGlyphBounds,
|
||||
transformedPdfBounds.BottomLeft,
|
||||
transformedPdfBounds.BottomRight,
|
||||
transformedPdfBounds.Width,
|
||||
fontSize,
|
||||
font.Details,
|
||||
color,
|
||||
pointSize,
|
||||
textSequence);
|
||||
}
|
||||
|
||||
letters.Add(letter);
|
||||
|
||||
|
75
src/UglyToad.PdfPig/Util/Diacritics.cs
Normal file
75
src/UglyToad.PdfPig/Util/Diacritics.cs
Normal file
@ -0,0 +1,75 @@
|
||||
namespace UglyToad.PdfPig.Util
{
    using System;
    using System.Collections.Generic;
    using System.Globalization;

    /// <summary>
    /// Helpers for recognising diacritic marks and merging a combining mark into the letter before it.
    /// </summary>
    internal static class Diacritics
    {
        /// <summary>
        /// First code point treated as a combining diacritic (U+0300, decimal 768).
        /// </summary>
        private const int CombiningRangeStart = 0x0300;

        /// <summary>
        /// Last code point treated as a combining diacritic (U+036F, decimal 879).
        /// </summary>
        private const int CombiningRangeEnd = 0x036F;

        /// <summary>
        /// Symbols which may appear on their own rather than as combining marks.
        /// </summary>
        private static readonly HashSet<string> NonCombiningDiacritics = new HashSet<string>
        {
            "´",
            "^",
            "ˆ",
            "¨",
            "©",
            "™",
            "®",
            "`",
            "˜",
            "∼",
            "¸"
        };

        /// <summary>
        /// Whether <paramref name="value"/> is one of the known standalone (non-combining) symbols.
        /// </summary>
        /// <param name="value">The text to look up.</param>
        /// <returns><c>true</c> if the value is in the standalone set, otherwise <c>false</c>.</returns>
        public static bool IsPotentialStandaloneDiacritic(string value) => NonCombiningDiacritics.Contains(value);

        /// <summary>
        /// Whether <paramref name="value"/> is a single UTF-16 character whose code lies in the
        /// combining range U+0300 to U+036F inclusive.
        /// </summary>
        /// <param name="value">The text to test; <c>null</c> or any length other than one yields <c>false</c>.</param>
        /// <returns><c>true</c> if the value is a single character inside the range.</returns>
        public static bool IsInCombiningDiacriticRange(string value)
        {
            // A null value is simply "not a diacritic" rather than a reason to throw.
            if (value == null || value.Length != 1)
            {
                return false;
            }

            var codePoint = (int)value[0];

            return codePoint >= CombiningRangeStart && codePoint <= CombiningRangeEnd;
        }

        /// <summary>
        /// Attempts to append <paramref name="diacritic"/> to <paramref name="previous"/> so that the
        /// pair forms a single text element.
        /// </summary>
        /// <param name="diacritic">The mark to append.</param>
        /// <param name="previous">The text of the preceding letter.</param>
        /// <param name="result">
        /// On success, <paramref name="previous"/> followed by <paramref name="diacritic"/>;
        /// <c>null</c> whenever the method returns <c>false</c>.
        /// </param>
        /// <returns>
        /// <c>true</c> if appending the mark did not change the number of text elements,
        /// i.e. the mark attached to existing text instead of standing alone.
        /// </returns>
        public static bool TryCombineDiacriticWithPreviousLetter(string diacritic, string previous, out string result)
        {
            result = null;

            // Without both a letter and a mark there is nothing to combine.
            if (previous == null || string.IsNullOrEmpty(diacritic))
            {
                return false;
            }

            var combined = previous + diacritic;

            // On combining the length should remain equal.
            if (MeasureDiacriticAwareLength(previous) != MeasureDiacriticAwareLength(combined))
            {
                return false;
            }

            result = combined;
            return true;
        }

        /// <summary>
        /// Counts the text elements in <paramref name="input"/> as reported by <see cref="StringInfo"/>.
        /// </summary>
        private static int MeasureDiacriticAwareLength(string input)
        {
            return new StringInfo(input).LengthInTextElements;
        }
    }
}
|
Loading…
Reference in New Issue
Block a user