New GetText() option: NegativeGapAsWhitespace

When parsing PDF files with tables containing multiple lines in a cell or "merged" cells, the separate words can appear out of horizontal order. This option can better predict the spaces between the words.
This commit is contained in:
Kizaemon 2024-12-08 21:28:47 +09:00 committed by BobLd
parent 97ae62ce6f
commit a2ae1f16d6

View File

@ -110,6 +110,11 @@
{
var gap = letter.StartBaseLine.X - previous.EndBaseLine.X;
if (options.NegativeGapAsWhitespace)
{
gap = Math.Abs(gap);
}
if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous))
{
sb.Append(" ");
@ -178,6 +183,13 @@
/// character. Default <see langword="false"/>.
/// </summary>
public bool ReplaceWhitespaceWithSpace { get; set; }
/// <summary>
/// When parsing PDF files with tables containing multiple lines in a cell or "merged" cells,
/// the separate words can appear out of horizontal order. This option can better predict the
/// spaces between the words. When <see langword="true"/>, the absolute value of the horizontal
/// gap between consecutive letters is used for the whitespace check, so a negative gap can
/// also result in a space being inserted. Default <see langword="false"/>.
/// </summary>
public bool NegativeGapAsWhitespace { get; set; }
}
}
}