From a2ae1f16d64879fcaa9dece3ef9b937c57c61057 Mon Sep 17 00:00:00 2001
From: Kizaemon <38976573+Kizaemon@users.noreply.github.com>
Date: Sun, 8 Dec 2024 21:28:47 +0900
Subject: [PATCH] New GetText() option: NegativeGapAsWhitespace

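When parsing PDF files with tables containing multiple lines in a cell or "merged" cells, the separate words can appear out of horizontal order, which makes the gap between consecutive letters negative. When this option is enabled, the absolute value of the gap is used for whitespace detection, so the spaces between such words can be predicted better.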
---
 .../TextExtractor/ContentOrderTextExtractor.cs       | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
index 80f5f58a..49d8d912 100644
--- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
+++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs
@@ -110,6 +110,11 @@
                     {
                         var gap = letter.StartBaseLine.X - previous.EndBaseLine.X;
 
+                        if (options.NegativeGapAsWhitespace)
+                        {
+                            gap = Math.Abs(gap);
+                        }
+
                         if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous))
                         {
                             sb.Append(" ");
@@ -178,6 +183,13 @@
             /// character. Default <see langword="false"/>.
             /// </summary>
             public bool ReplaceWhitespaceWithSpace { get; set; }
+
+            /// <summary>
+            /// When parsing PDF files with tables containing multiple lines in a cell or "merged" cells,
+            /// the separate words can appear out of horizontal order. This option can better predict the
+            /// spaces between the words. Default <see langword="false"/>.
+            /// </summary>
+            public bool NegativeGapAsWhitespace { get; set; }
         }
     }
 }