mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-04-05 20:55:01 +08:00
handle narrow whitespaces in default text extractor #319
Where the gap between two letters is small but much larger than the most common gap seen so far at this font size (and still larger than a minimum threshold relative to the letter height), break the word at this gap boundary.
This commit is contained in:
parent
264cf7b8a8
commit
97e831c321
@ -21,6 +21,8 @@
|
||||
|
||||
var lettersSoFar = new List<Letter>(10);
|
||||
|
||||
var gapCountsSoFarByFontSize = new Dictionary<double, Dictionary<double, int>>();
|
||||
|
||||
var y = default(double?);
|
||||
var lastX = default(double?);
|
||||
var lastLetter = default(Letter);
|
||||
@ -68,15 +70,48 @@
|
||||
continue;
|
||||
}
|
||||
|
||||
var letterHeight = Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height);
|
||||
|
||||
var gap = letter.Location.X - (lastLetter.Location.X + lastLetter.Width);
|
||||
var nextToLeft = letter.Location.X < lastX.Value - 1;
|
||||
var nextBigSpace = gap > Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height) * 0.39;
|
||||
var nextBigSpace = gap > letterHeight * 0.39;
|
||||
var nextIsWhiteSpace = string.IsNullOrWhiteSpace(letter.Value);
|
||||
var nextFontDiffers = !string.Equals(letter.FontName, lastLetter.FontName, StringComparison.OrdinalIgnoreCase) && gap > letter.Width * 0.1;
|
||||
var nextFontSizeDiffers = Math.Abs(letter.FontSize - lastLetter.FontSize) > 0.1;
|
||||
var nextTextOrientationDiffers = letter.TextOrientation != lastLetter.TextOrientation;
|
||||
|
||||
if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers)
|
||||
var suspectGap = false;
|
||||
|
||||
if (!nextFontSizeDiffers && letter.FontSize > 0 && gap >= 0)
|
||||
{
|
||||
var fontSize = Math.Round(letter.FontSize);
|
||||
if (!gapCountsSoFarByFontSize.TryGetValue(fontSize, out var gapCounts))
|
||||
{
|
||||
gapCounts = new Dictionary<double, int>();
|
||||
gapCountsSoFarByFontSize[fontSize] = gapCounts;
|
||||
}
|
||||
|
||||
var gapRounded = Math.Round(gap, 2);
|
||||
if (!gapCounts.ContainsKey(gapRounded))
|
||||
{
|
||||
gapCounts[gapRounded] = 0;
|
||||
}
|
||||
|
||||
gapCounts[gapRounded]++;
|
||||
|
||||
// More than one type of gap.
|
||||
if (gapCounts.Count > 1 && gap > letterHeight * 0.16)
|
||||
{
|
||||
var mostCommonGap = gapCounts.OrderByDescending(x => x.Value).First();
|
||||
|
||||
if (gap > (mostCommonGap.Key * 5) && mostCommonGap.Value > 1)
|
||||
{
|
||||
suspectGap = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers || suspectGap)
|
||||
{
|
||||
if (lettersSoFar.Count > 0)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user