handle narrow whitespaces in default text extractor #319

When the gap between two letters is small but much larger than the most
common gap seen so far at this font size (and still larger than some
minimum threshold), break the word at this gap boundary.
This commit is contained in:
Eliot Jones 2021-05-09 12:59:53 -04:00
parent 264cf7b8a8
commit 97e831c321

View File

@ -21,6 +21,8 @@
var lettersSoFar = new List<Letter>(10);
var gapCountsSoFarByFontSize = new Dictionary<double, Dictionary<double, int>>();
var y = default(double?);
var lastX = default(double?);
var lastLetter = default(Letter);
@ -68,15 +70,48 @@
continue;
}
var letterHeight = Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height);
var gap = letter.Location.X - (lastLetter.Location.X + lastLetter.Width);
var nextToLeft = letter.Location.X < lastX.Value - 1;
var nextBigSpace = gap > Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height) * 0.39;
var nextBigSpace = gap > letterHeight * 0.39;
var nextIsWhiteSpace = string.IsNullOrWhiteSpace(letter.Value);
var nextFontDiffers = !string.Equals(letter.FontName, lastLetter.FontName, StringComparison.OrdinalIgnoreCase) && gap > letter.Width * 0.1;
var nextFontSizeDiffers = Math.Abs(letter.FontSize - lastLetter.FontSize) > 0.1;
var nextTextOrientationDiffers = letter.TextOrientation != lastLetter.TextOrientation;
if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers)
var suspectGap = false;
if (!nextFontSizeDiffers && letter.FontSize > 0 && gap >= 0)
{
var fontSize = Math.Round(letter.FontSize);
if (!gapCountsSoFarByFontSize.TryGetValue(fontSize, out var gapCounts))
{
gapCounts = new Dictionary<double, int>();
gapCountsSoFarByFontSize[fontSize] = gapCounts;
}
var gapRounded = Math.Round(gap, 2);
if (!gapCounts.ContainsKey(gapRounded))
{
gapCounts[gapRounded] = 0;
}
gapCounts[gapRounded]++;
// More than one type of gap.
if (gapCounts.Count > 1 && gap > letterHeight * 0.16)
{
var mostCommonGap = gapCounts.OrderByDescending(x => x.Value).First();
if (gap > (mostCommonGap.Key * 5) && mostCommonGap.Value > 1)
{
suspectGap = true;
}
}
}
if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers || suspectGap)
{
if (lettersSoFar.Count > 0)
{