diff --git a/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs b/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs index fcc316b8..be663e3b 100644 --- a/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs +++ b/src/UglyToad.PdfPig/Util/DefaultWordExtractor.cs @@ -21,6 +21,8 @@ var lettersSoFar = new List(10); + var gapCountsSoFarByFontSize = new Dictionary>(); + var y = default(double?); var lastX = default(double?); var lastLetter = default(Letter); @@ -68,15 +70,48 @@ continue; } + var letterHeight = Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height); + var gap = letter.Location.X - (lastLetter.Location.X + lastLetter.Width); var nextToLeft = letter.Location.X < lastX.Value - 1; - var nextBigSpace = gap > Math.Max(lastLetter.GlyphRectangle.Height, letter.GlyphRectangle.Height) * 0.39; + var nextBigSpace = gap > letterHeight * 0.39; var nextIsWhiteSpace = string.IsNullOrWhiteSpace(letter.Value); var nextFontDiffers = !string.Equals(letter.FontName, lastLetter.FontName, StringComparison.OrdinalIgnoreCase) && gap > letter.Width * 0.1; var nextFontSizeDiffers = Math.Abs(letter.FontSize - lastLetter.FontSize) > 0.1; var nextTextOrientationDiffers = letter.TextOrientation != lastLetter.TextOrientation; - if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers) + var suspectGap = false; + + if (!nextFontSizeDiffers && letter.FontSize > 0 && gap >= 0) + { + var fontSize = Math.Round(letter.FontSize); + if (!gapCountsSoFarByFontSize.TryGetValue(fontSize, out var gapCounts)) + { + gapCounts = new Dictionary(); + gapCountsSoFarByFontSize[fontSize] = gapCounts; + } + + var gapRounded = Math.Round(gap, 2); + if (!gapCounts.ContainsKey(gapRounded)) + { + gapCounts[gapRounded] = 0; + } + + gapCounts[gapRounded]++; + + // More than one type of gap. + if (gapCounts.Count > 1 && gap > letterHeight * 0.16) + { + var mostCommonGap = gapCounts.OrderByDescending(x => x.Value).First(); + + if (gap > (mostCommonGap.Key * 5) && mostCommonGap.Value > 1) + { + suspectGap = true; + } + } + } + + if (nextToLeft || nextBigSpace || nextIsWhiteSpace || nextFontDiffers || nextFontSizeDiffers || nextTextOrientationDiffers || suspectGap) { if (lettersSoFar.Count > 0) {