From a2ae1f16d64879fcaa9dece3ef9b937c57c61057 Mon Sep 17 00:00:00 2001 From: Kizaemon <38976573+Kizaemon@users.noreply.github.com> Date: Sun, 8 Dec 2024 21:28:47 +0900 Subject: [PATCH] New GetText() option: NegativeGapAsWhitespace When parsing PDF files with tables containing multiple lines in a cell or "merged" cells, the separate words can appear out of horizontal order. This option can better predict the spaces between the words. --- .../TextExtractor/ContentOrderTextExtractor.cs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs index 80f5f58a..49d8d912 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs @@ -110,6 +110,11 @@ { var gap = letter.StartBaseLine.X - previous.EndBaseLine.X; + if (options.NegativeGapAsWhitespace) + { + gap = Math.Abs(gap); + } + if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous)) { sb.Append(" "); @@ -178,6 +183,13 @@ /// character. Default <see langword="false"/>. /// </summary> public bool ReplaceWhitespaceWithSpace { get; set; } + + /// <summary> + /// When parsing PDF files with tables containing multiple lines in a cell or "merged" cells, + /// the separate words can appear out of horizontal order. This option can better predict the + /// spaces between the words. Default <see langword="false"/>. + /// </summary> + public bool NegativeGapAsWhitespace { get; set; } } } }