From a2ae1f16d64879fcaa9dece3ef9b937c57c61057 Mon Sep 17 00:00:00 2001
From: Kizaemon <38976573+Kizaemon@users.noreply.github.com>
Date: Sun, 8 Dec 2024 21:28:47 +0900
Subject: [PATCH] New GetText() option: NegativeGapAsWhitespace
When parsing PDF files with tables containing multiple lines in a cell or "merged" cells, the separate words can appear out of horizontal order. This option can better predict the spaces between the words.
---
.../TextExtractor/ContentOrderTextExtractor.cs | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
index 80f5f58a..49d8d912 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
@@ -110,6 +110,11 @@
{
var gap = letter.StartBaseLine.X - previous.EndBaseLine.X;
+ if (options.NegativeGapAsWhitespace)
+ {
+ gap = Math.Abs(gap);
+ }
+
if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous))
{
sb.Append(" ");
@@ -178,6 +183,13 @@
/// character. Default .
///
public bool ReplaceWhitespaceWithSpace { get; set; }
+
+ /// <summary>
+ /// When parsing PDF files with tables containing multiple lines in a cell or "merged" cells,
+ /// the separate words can appear out of horizontal order. This option can better predict the
+ /// spaces between the words. Default <see langword="false"/>.
+ /// </summary>
+ public bool NegativeGapAsWhitespace { get; set; }
}
}
}