diff --git a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs index 80f5f58a..49d8d912 100644 --- a/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs +++ b/src/UglyToad.PdfPig.DocumentLayoutAnalysis/TextExtractor/ContentOrderTextExtractor.cs @@ -110,6 +110,11 @@ { var gap = letter.StartBaseLine.X - previous.EndBaseLine.X; + if (options.NegativeGapAsWhitespace) + { + gap = Math.Abs(gap); + } + if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous)) { sb.Append(" "); @@ -178,6 +183,13 @@ /// character. Default . /// public bool ReplaceWhitespaceWithSpace { get; set; } + + /// + /// When parsing PDF files with tables containing multiple lines in a cell or "merged" cells, + /// the separate words can appear out of horizontal order. This option can better predict the + /// spaces between the words. Default . + /// + public bool NegativeGapAsWhitespace { get; set; } } } }