// PdfPig/examples/AdvancedTextExtraction.cs

namespace UglyToad.Examples
{
    using PdfPig;
    using System;
    using System.Text;
    using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
    using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
    using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;

    /// <summary>
    /// Example of a multi-step text extraction pipeline: word extraction with a custom
    /// letter filter, page segmentation, reading order detection and text output.
    /// </summary>
    public static class AdvancedTextExtraction
    {
        /// <summary>
        /// Opens the document at <paramref name="filePath"/>, collects the text of every
        /// page block by block and writes the accumulated text to the console.
        /// </summary>
        /// <remarks>
        /// The whole body is compiled only when the <c>YET_TO_BE_DONE</c> symbol is defined;
        /// without that symbol this method does nothing.
        /// </remarks>
        /// <param name="filePath">Path handed to <c>PdfDocument.Open</c>.</param>
        public static void Run(string filePath)
        {
#if YET_TO_BE_DONE
            var output = new StringBuilder();

            using (var pdf = PdfDocument.Open(filePath))
            {
                foreach (var page in pdf.GetPages())
                {
                    // Step 0 - preprocessing: the page letters are used untouched.
                    var pageLetters = page.Letters;

                    // Step 1 - word extraction, with a custom rule deciding whether two
                    // letters are allowed to end up in the same word.
                    var extractor = NearestNeighbourWordExtractor.Instance;
                    var extractorOptions = new NearestNeighbourWordExtractor.NearestNeighbourWordExtractorOptions()
                    {
                        Filter = (pivot, candidate) =>
                        {
                            // White space rule (default implementation of 'Filter'):
                            // a null or white space candidate never joins the pivot's word.
                            // ('FilterPivot' already checks if the pivot is null or white space by default)
                            if (string.IsNullOrWhiteSpace(candidate.Value))
                            {
                                return false;
                            }

                            // Size rule: reject the pair when one letter is more than twice
                            // the point size of the other. A zero smaller size skips the
                            // rule so that no division by zero takes place.
                            var smallerSize = Math.Min(pivot.PointSize, candidate.PointSize);
                            var largerSize = Math.Max(pivot.PointSize, candidate.PointSize);
                            var sizesTooDifferent = smallerSize != 0 && largerSize / smallerSize > 2.0;
                            if (sizesTooDifferent)
                            {
                                return false;
                            }

                            // Colour rule: both letters must have the same RGB values.
                            return pivot.Color.ToRGBValues().Equals(candidate.Color.ToRGBValues());
                        }
                    };

                    var pageWords = extractor.GetWords(pageLetters, extractorOptions);

                    // Step 2 - page segmentation, using options left at their defaults.
                    var segmenter = DocstrumBoundingBoxes.Instance;
                    var segmenterOptions = new DocstrumBoundingBoxes.DocstrumBoundingBoxesOptions();

                    var blocks = segmenter.GetBlocks(pageWords, segmenterOptions);

                    // Step 3 - postprocessing: put the blocks in reading order.
                    var orderDetector = UnsupervisedReadingOrderDetector.Instance;
                    var blocksInReadingOrder = orderDetector.Get(blocks);

                    // Step 4 - text output: one normalised (form KC) line per block.
                    foreach (var textBlock in blocksInReadingOrder)
                    {
                        output.AppendLine(textBlock.Text.Normalize(NormalizationForm.FormKC));
                    }

                    // Blank line between pages.
                    output.AppendLine();
                }
            }

            Console.WriteLine(output.ToString());
#endif
        }
    }
}