PdfPig/examples/OpenDocumentAndExtractWords.cs
2020-04-18 18:46:26 +01:00

56 lines
1.8 KiB
C#

namespace UglyToad.Examples
{
using System;
using System.Text;
using PdfPig;
using PdfPig.Content;
/// <summary>
/// Example showing how to open a PDF document, read the words of every page in order
/// and print them to the console as plain text with line breaks.
/// </summary>
public static class OpenDocumentAndExtractWords
{
    /// <summary>
    /// Two consecutive words are treated as being on different lines when the baseline Y values
    /// of their first letters differ by more than this amount.
    /// </summary>
    private const int NewLineBaselineThreshold = 3;

    /// <summary>
    /// Opens the document at <paramref name="filePath"/>, joins the text of all of its words
    /// and writes the result to the console.
    /// </summary>
    /// <param name="filePath">Path of the PDF file, passed unchanged to <c>PdfDocument.Open</c>.</param>
    /// <remarks>
    /// Words on the same line are separated by a single space. A line break is written when the
    /// baseline moves by more than <see cref="NewLineBaselineThreshold"/> and whenever a new page starts.
    /// </remarks>
    public static void Run(string filePath)
    {
        var sb = new StringBuilder();

        using (var document = PdfDocument.Open(filePath))
        {
            Word previous = null;

            foreach (var page in document.GetPages())
            {
                // Baselines are only compared between words of the same page: a Y value taken from
                // the previous page says nothing about where the first word of this page sits, so a
                // page boundary is always rendered as a line break.
                var isFirstWordOnPage = true;

                foreach (var word in page.GetWords())
                {
                    // No separator is written before the very first word of the document.
                    if (previous != null)
                    {
                        if (isFirstWordOnPage || StartsNewLine(previous, word))
                        {
                            sb.AppendLine();
                        }
                        else
                        {
                            sb.Append(' ');
                        }
                    }

                    sb.Append(word.Text);

                    previous = word;
                    isFirstWordOnPage = false;
                }
            }
        }

        Console.WriteLine(sb.ToString());
    }

    /// <summary>
    /// Decides whether <paramref name="current"/> begins a new line relative to <paramref name="previous"/>
    /// by comparing the baseline Y value of the first letter of each word.
    /// </summary>
    /// <param name="previous">The word that was written immediately before.</param>
    /// <param name="current">The word about to be written.</param>
    /// <returns>
    /// <c>true</c> when both words have at least one letter and their baselines differ by more than
    /// <see cref="NewLineBaselineThreshold"/>; otherwise <c>false</c>.
    /// </returns>
    private static bool StartsNewLine(Word previous, Word current)
    {
        // Without letters on both sides there is no baseline to compare; the caller falls back to a space.
        if (previous.Letters.Count == 0 || current.Letters.Count == 0)
        {
            return false;
        }

        var previousBaseline = previous.Letters[0].StartBaseLine.Y;
        var currentBaseline = current.Letters[0].StartBaseLine.Y;

        return Math.Abs(previousBaseline - currentBaseline) > NewLineBaselineThreshold;
    }
}
}