mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-04-05 20:55:01 +08:00
Merge pull request #440 from UglyToad/diacritics-stuff
add handling for combining diacritics #439
This commit is contained in:
commit
49ce5c7eb7
@ -42,6 +42,5 @@
|
|||||||
Assert.Contains("financial results for the fiscal quarter ended June 30, 2017 and (2) a conference call to discuss those results and Farmer Mac", page.Text);
|
Assert.Contains("financial results for the fiscal quarter ended June 30, 2017 and (2) a conference call to discuss those results and Farmer Mac", page.Text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -0,0 +1,18 @@
|
|||||||
|
namespace UglyToad.PdfPig.Tests.Integration;
|
||||||
|
|
||||||
|
using Xunit;
|
||||||
|
|
||||||
|
/// <summary>
/// Integration tests which run against the "Math119FakingData.pdf" sample document.
/// </summary>
public class Math119FakingDataTests
{
    /// <summary>
    /// Opens the sample document and requests the words of page 8.
    /// </summary>
    [Fact]
    public void CombinesDiaeresisForWords()
    {
        var path = IntegrationHelpers.GetDocumentPath("Math119FakingData.pdf");

        using (var document = PdfDocument.Open(path))
        {
            // NOTE(review): nothing is asserted on the returned words, so this test only
            // verifies that opening the document and calling GetWords() does not throw.
            // TODO confirm the expected diaeresis-combined words and assert on them.
            var words = document.GetPage(8).GetWords();
        }
    }
}
|
@ -17,6 +17,7 @@
|
|||||||
using Tokenization.Scanner;
|
using Tokenization.Scanner;
|
||||||
using Tokens;
|
using Tokens;
|
||||||
using Operations.TextPositioning;
|
using Operations.TextPositioning;
|
||||||
|
using Util;
|
||||||
using XObjects;
|
using XObjects;
|
||||||
using static PdfPig.Core.PdfSubpath;
|
using static PdfPig.Core.PdfSubpath;
|
||||||
|
|
||||||
@ -293,15 +294,58 @@
|
|||||||
? currentState.CurrentNonStrokingColor
|
? currentState.CurrentNonStrokingColor
|
||||||
: currentState.CurrentStrokingColor;
|
: currentState.CurrentStrokingColor;
|
||||||
|
|
||||||
var letter = new Letter(unicode, transformedGlyphBounds,
|
Letter letter = null;
|
||||||
transformedPdfBounds.BottomLeft,
|
if (Diacritics.IsInCombiningDiacriticRange(unicode) && bytes.CurrentOffset > 0 && letters.Count > 0)
|
||||||
transformedPdfBounds.BottomRight,
|
{
|
||||||
transformedPdfBounds.Width,
|
var attachTo = letters[letters.Count - 1];
|
||||||
fontSize,
|
|
||||||
font.Details,
|
if (attachTo.TextSequence == textSequence
|
||||||
color,
|
&& Diacritics.TryCombineDiacriticWithPreviousLetter(unicode, attachTo.Value, out var newLetter))
|
||||||
pointSize,
|
{
|
||||||
textSequence);
|
// TODO: union of bounding boxes.
|
||||||
|
letters.Remove(attachTo);
|
||||||
|
|
||||||
|
letter = new Letter(
|
||||||
|
newLetter,
|
||||||
|
attachTo.GlyphRectangle,
|
||||||
|
attachTo.StartBaseLine,
|
||||||
|
attachTo.EndBaseLine,
|
||||||
|
attachTo.Width,
|
||||||
|
attachTo.FontSize,
|
||||||
|
attachTo.Font,
|
||||||
|
attachTo.Color,
|
||||||
|
attachTo.PointSize,
|
||||||
|
attachTo.TextSequence);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
letter = new Letter(
|
||||||
|
unicode,
|
||||||
|
transformedGlyphBounds,
|
||||||
|
transformedPdfBounds.BottomLeft,
|
||||||
|
transformedPdfBounds.BottomRight,
|
||||||
|
transformedPdfBounds.Width,
|
||||||
|
fontSize,
|
||||||
|
font.Details,
|
||||||
|
color,
|
||||||
|
pointSize,
|
||||||
|
textSequence);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
letter = new Letter(
|
||||||
|
unicode,
|
||||||
|
transformedGlyphBounds,
|
||||||
|
transformedPdfBounds.BottomLeft,
|
||||||
|
transformedPdfBounds.BottomRight,
|
||||||
|
transformedPdfBounds.Width,
|
||||||
|
fontSize,
|
||||||
|
font.Details,
|
||||||
|
color,
|
||||||
|
pointSize,
|
||||||
|
textSequence);
|
||||||
|
}
|
||||||
|
|
||||||
letters.Add(letter);
|
letters.Add(letter);
|
||||||
|
|
||||||
|
75
src/UglyToad.PdfPig/Util/Diacritics.cs
Normal file
75
src/UglyToad.PdfPig/Util/Diacritics.cs
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
namespace UglyToad.PdfPig.Util
|
||||||
|
{
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Globalization;
|
||||||
|
|
||||||
|
/// <summary>
/// Helpers for recognising diacritic marks and merging a combining diacritic
/// into the letter which precedes it.
/// </summary>
internal static class Diacritics
{
    // First and last code points of the Unicode "Combining Diacritical Marks" block (U+0300 - U+036F).
    private const int CombiningDiacriticRangeStart = 0x0300;
    private const int CombiningDiacriticRangeEnd = 0x036F;

    // Spacing (non-combining) characters which are looked up by IsPotentialStandaloneDiacritic.
    // NOTE(review): the set also contains "©", "™" and "®", which are not diacritics;
    // presumably callers want them treated the same way - confirm against usages.
    private static readonly HashSet<string> NonCombiningDiacritics = new HashSet<string>
    {
        "´",
        "^",
        "ˆ",
        "¨",
        "©",
        "™",
        "®",
        "`",
        "˜",
        "∼",
        "¸"
    };

    /// <summary>
    /// Whether <paramref name="value"/> is one of the known standalone (non-combining) marks.
    /// </summary>
    /// <param name="value">The text to look up.</param>
    /// <returns><c>true</c> if the value is contained in the set of standalone marks.</returns>
    public static bool IsPotentialStandaloneDiacritic(string value) => NonCombiningDiacritics.Contains(value);

    /// <summary>
    /// Whether <paramref name="value"/> is a single UTF-16 code unit inside the
    /// Combining Diacritical Marks block (U+0300 - U+036F).
    /// </summary>
    /// <param name="value">The text to test, may be <c>null</c>.</param>
    /// <returns>
    /// <c>true</c> for a one character string in the range, <c>false</c> for <c>null</c>,
    /// empty, multi-character or out of range input.
    /// </returns>
    public static bool IsInCombiningDiacriticRange(string value)
    {
        // Guard against null so that callers get 'false' rather than a NullReferenceException.
        if (value == null || value.Length != 1)
        {
            return false;
        }

        var codePoint = (int)value[0];

        return codePoint >= CombiningDiacriticRangeStart && codePoint <= CombiningDiacriticRangeEnd;
    }

    /// <summary>
    /// Attempts to append <paramref name="diacritic"/> to <paramref name="previous"/> so that
    /// the two render as a single text element.
    /// </summary>
    /// <param name="diacritic">The combining mark to append.</param>
    /// <param name="previous">The value of the letter the mark should attach to.</param>
    /// <param name="result">
    /// The combined text when the method returns <c>true</c>, otherwise <c>null</c>.
    /// </param>
    /// <returns>
    /// <c>true</c> if appending the mark did not change the number of text elements,
    /// i.e. the mark merged into the previous letter.
    /// </returns>
    public static bool TryCombineDiacriticWithPreviousLetter(string diacritic, string previous, out string result)
    {
        result = null;

        // Appending a null/empty mark would leave 'previous' unchanged and look like a
        // successful combination, so reject it up front.
        if (previous == null || string.IsNullOrEmpty(diacritic))
        {
            return false;
        }

        var combined = previous + diacritic;

        // On combining the length (in text elements) should remain equal.
        if (MeasureDiacriticAwareLength(previous) != MeasureDiacriticAwareLength(combined))
        {
            // Leave 'result' null on failure so the out value follows the usual Try-pattern contract.
            return false;
        }

        result = combined;

        return true;
    }

    /// <summary>
    /// Counts the text elements in <paramref name="input"/>, so a base character
    /// followed by combining marks counts as one.
    /// </summary>
    private static int MeasureDiacriticAwareLength(string input)
    {
        return new StringInfo(input).LengthInTextElements;
    }
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user