mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-04-05 20:55:01 +08:00
add content order text extractor and example of use
This commit is contained in:
parent
f18bc0766a
commit
407ee5ca51
22
examples/ExtractTextWithNewlines.cs
Normal file
22
examples/ExtractTextWithNewlines.cs
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
namespace UglyToad.Examples
|
||||||
|
{
|
||||||
|
using System;
|
||||||
|
using PdfPig;
|
||||||
|
using PdfPig.DocumentLayoutAnalysis.TextExtractor;
|
||||||
|
|
||||||
|
/// <summary>
/// Example showing how to print the text of every page of a PDF document using
/// <see cref="ContentOrderTextExtractor"/>.
/// </summary>
internal static class ExtractTextWithNewlines
{
    /// <summary>
    /// Opens the document at <paramref name="filePath"/> and writes the extracted text of each page to the console.
    /// </summary>
    /// <param name="filePath">Path of the PDF file to open.</param>
    public static void Run(string filePath)
    {
        using (var pdf = PdfDocument.Open(filePath))
        {
            foreach (var currentPage in pdf.GetPages())
            {
                // Request the extra blank line the extractor can emit between likely paragraphs.
                Console.WriteLine(ContentOrderTextExtractor.GetText(currentPage, addDoubleNewline: true));
            }
        }
    }
}
|
||||||
|
}
|
@ -17,12 +17,16 @@
|
|||||||
var examples = new Dictionary<int, (string name, Action action)>
|
var examples = new Dictionary<int, (string name, Action action)>
|
||||||
{
|
{
|
||||||
{1,
|
{1,
|
||||||
("Extract Words with newline detection",
|
("Extract Words with newline detection (example with algorithm)",
|
||||||
() => OpenDocumentAndExtractWords.Run(Path.Combine(filesDirectory, "Two Page Text Only - from libre office.pdf")))
|
() => OpenDocumentAndExtractWords.Run(Path.Combine(filesDirectory, "Two Page Text Only - from libre office.pdf")))
|
||||||
},
|
},
|
||||||
{2,
|
{2,
|
||||||
("Extract images",
|
("Extract images",
|
||||||
() => ExtractImages.Run(Path.Combine(filesDirectory, "2006_Swedish_Touring_Car_Championship.pdf")))
|
() => ExtractImages.Run(Path.Combine(filesDirectory, "2006_Swedish_Touring_Car_Championship.pdf")))
|
||||||
|
},
|
||||||
|
{3,
|
||||||
|
("Extract Text with newlines (using built-in content extractor)",
|
||||||
|
() => ExtractTextWithNewlines.Run(Path.Combine(filesDirectory, "Two Page Text Only - from libre office.pdf")))
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
|
<ProjectReference Include="..\src\UglyToad.PdfPig.DocumentLayoutAnalysis\UglyToad.PdfPig.DocumentLayoutAnalysis.csproj" />
|
||||||
<ProjectReference Include="..\src\UglyToad.PdfPig\UglyToad.PdfPig.csproj" />
|
<ProjectReference Include="..\src\UglyToad.PdfPig\UglyToad.PdfPig.csproj" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
@ -15,6 +15,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UglyToad.PdfPig.Tokens", ".
|
|||||||
EndProject
|
EndProject
|
||||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UglyToad.PdfPig", "..\src\UglyToad.PdfPig\UglyToad.PdfPig.csproj", "{75ED54D6-308F-44AD-B85E-C027F3AA80AE}"
|
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UglyToad.PdfPig", "..\src\UglyToad.PdfPig\UglyToad.PdfPig.csproj", "{75ED54D6-308F-44AD-B85E-C027F3AA80AE}"
|
||||||
EndProject
|
EndProject
|
||||||
|
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UglyToad.PdfPig.DocumentLayoutAnalysis", "..\src\UglyToad.PdfPig.DocumentLayoutAnalysis\UglyToad.PdfPig.DocumentLayoutAnalysis.csproj", "{70FEC330-CF3F-4815-9BA6-E622907086C9}"
|
||||||
|
EndProject
|
||||||
Global
|
Global
|
||||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||||
Debug|Any CPU = Debug|Any CPU
|
Debug|Any CPU = Debug|Any CPU
|
||||||
@ -45,6 +47,10 @@ Global
|
|||||||
{75ED54D6-308F-44AD-B85E-C027F3AA80AE}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
{75ED54D6-308F-44AD-B85E-C027F3AA80AE}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
{75ED54D6-308F-44AD-B85E-C027F3AA80AE}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
{75ED54D6-308F-44AD-B85E-C027F3AA80AE}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
{75ED54D6-308F-44AD-B85E-C027F3AA80AE}.Release|Any CPU.Build.0 = Release|Any CPU
|
{75ED54D6-308F-44AD-B85E-C027F3AA80AE}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
|
{70FEC330-CF3F-4815-9BA6-E622907086C9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||||
|
{70FEC330-CF3F-4815-9BA6-E622907086C9}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||||
|
{70FEC330-CF3F-4815-9BA6-E622907086C9}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||||
|
{70FEC330-CF3F-4815-9BA6-E622907086C9}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||||
EndGlobalSection
|
EndGlobalSection
|
||||||
GlobalSection(SolutionProperties) = preSolution
|
GlobalSection(SolutionProperties) = preSolution
|
||||||
HideSolutionNode = FALSE
|
HideSolutionNode = FALSE
|
||||||
|
@ -0,0 +1,124 @@
|
|||||||
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor
|
||||||
|
{
|
||||||
|
using System;
|
||||||
|
using System.Text;
|
||||||
|
using Content;
|
||||||
|
using Util;
|
||||||
|
|
||||||
|
/// <summary>
/// Extracts text from a document based on the content order in the file.
/// </summary>
public static class ContentOrderTextExtractor
{
    /// <summary>
    /// A vertical baseline gap greater than this multiple of the smaller (rounded) point size
    /// of the two letters being compared is treated as a line break.
    /// </summary>
    private const double NewlineGapFactor = 0.9;

    /// <summary>
    /// A downward baseline gap greater than this multiple of the smaller (rounded) point size
    /// of the two letters being compared is flagged as a double new-line.
    /// </summary>
    private const double DoubleNewlineGapFactor = 1.7;

    /// <summary>
    /// Gets a human readable representation of the text from the page based on
    /// the letter order of the original PDF document.
    /// </summary>
    /// <param name="page">A page from the document.</param>
    /// <param name="addDoubleNewline">Whether to include a double new-line when the text is likely to be a new paragraph.</param>
    /// <returns>The text of the page with spaces and line breaks inserted where the letter positions suggest them.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="page"/> is <see langword="null"/>.</exception>
    public static string GetText(Page page, bool addDoubleNewline = false)
    {
        if (page == null)
        {
            throw new ArgumentNullException(nameof(page));
        }

        var sb = new StringBuilder();

        // The last letter whose value was appended to the builder.
        var previous = default(Letter);
        var hasJustAddedWhitespace = false;
        for (var i = 0; i < page.Letters.Count; i++)
        {
            var letter = page.Letters[i];

            if (string.IsNullOrEmpty(letter.Value))
            {
                continue;
            }

            if (letter.Value == " " && !hasJustAddedWhitespace)
            {
                // A space positioned on a different line from the previous letter is dropped;
                // the line break itself is emitted when the next non-space letter is processed.
                if (previous != null && IsNewline(previous, letter, page, out _))
                {
                    continue;
                }

                sb.Append(" ");
                previous = letter;
                hasJustAddedWhitespace = true;
                continue;
            }

            // Reached for non-space letters, and for a space arriving while the flag is still set;
            // such a space skips the layout checks below and is appended like any other letter.
            hasJustAddedWhitespace = false;

            if (previous != null && letter.Value != " ")
            {
                // Compare against the closest earlier letter that is not whitespace so that
                // the position of an intervening space does not affect line-break detection.
                var nwPrevious = GetNonWhitespacePrevious(page, i);

                if (IsNewline(nwPrevious, letter, page, out var isDoubleNewline))
                {
                    if (previous.Value == " ")
                    {
                        // 'previous' is always the last thing appended, so this removes the
                        // trailing space that would otherwise sit before the line break.
                        sb.Remove(sb.Length - 1, 1);
                    }

                    sb.AppendLine();
                    if (addDoubleNewline && isDoubleNewline)
                    {
                        sb.AppendLine();
                    }

                    hasJustAddedWhitespace = true;
                }
                else if (previous.Value != " ")
                {
                    // No explicit space in the content: infer one from the horizontal gap.
                    var gap = letter.StartBaseLine.X - previous.EndBaseLine.X;

                    if (WhitespaceSizeStatistics.IsProbablyWhitespace(gap, previous))
                    {
                        sb.Append(" ");
                        hasJustAddedWhitespace = true;
                    }
                }
            }

            sb.Append(letter.Value);
            previous = letter;
        }

        return sb.ToString();
    }

    /// <summary>
    /// Finds the closest letter before <paramref name="index"/> in the page's letters whose value
    /// is not null, empty or whitespace.
    /// </summary>
    /// <param name="page">The page whose letters are searched.</param>
    /// <param name="index">The index of the current letter; the search starts at the letter before it.</param>
    /// <returns>The matching letter, or <see langword="null"/> if there is none.</returns>
    private static Letter GetNonWhitespacePrevious(Page page, int index)
    {
        for (var i = index - 1; i >= 0; i--)
        {
            var letter = page.Letters[i];
            if (!string.IsNullOrWhiteSpace(letter.Value))
            {
                return letter;
            }
        }

        return null;
    }

    /// <summary>
    /// Decides whether <paramref name="letter"/> starts on a different line from <paramref name="previous"/>
    /// by comparing the vertical distance between their baselines with their point sizes.
    /// </summary>
    /// <param name="previous">The earlier letter; when <see langword="null"/> the result is <see langword="false"/>.</param>
    /// <param name="letter">The current letter.</param>
    /// <param name="page">The page both letters belong to, used to look up their point sizes.</param>
    /// <param name="isDoubleNewline">
    /// Set to <see langword="true"/> when the current letter is below the previous one by more than
    /// <see cref="DoubleNewlineGapFactor"/> times the smaller point size.
    /// </param>
    /// <returns>
    /// <see langword="true"/> if the baselines differ, in either direction, by more than
    /// <see cref="NewlineGapFactor"/> times the smaller point size.
    /// </returns>
    private static bool IsNewline(Letter previous, Letter letter, Page page, out bool isDoubleNewline)
    {
        isDoubleNewline = false;

        if (previous == null)
        {
            return false;
        }

        // Point sizes are rounded to whole numbers before the smaller of the two is used as the scale.
        var ptSizePrevious = (int)Math.Round(page.ExperimentalAccess.GetPointSize(previous));
        var ptSize = (int)Math.Round(page.ExperimentalAccess.GetPointSize(letter));
        var minPtSize = Math.Min(ptSize, ptSizePrevious);

        var gap = Math.Abs(previous.StartBaseLine.Y - letter.StartBaseLine.Y);

        // Only a move down the page (smaller Y) can count as a double new-line.
        if (gap > minPtSize * DoubleNewlineGapFactor && previous.StartBaseLine.Y > letter.StartBaseLine.Y)
        {
            isDoubleNewline = true;
        }

        return gap > minPtSize * NewlineGapFactor;
    }
}
|
||||||
|
}
|
@ -15,7 +15,7 @@
|
|||||||
{
|
{
|
||||||
private readonly IReadOnlyList<uint> glyphOffsets;
|
private readonly IReadOnlyList<uint> glyphOffsets;
|
||||||
private readonly PdfRectangle maxGlyphBounds;
|
private readonly PdfRectangle maxGlyphBounds;
|
||||||
private readonly TrueTypeDataBytes tableBytes;
|
private TrueTypeDataBytes tableBytes;
|
||||||
|
|
||||||
/// <inheritdoc />
|
/// <inheritdoc />
|
||||||
public string Tag => TrueTypeHeaderTable.Glyf;
|
public string Tag => TrueTypeHeaderTable.Glyf;
|
||||||
@ -98,6 +98,11 @@
|
|||||||
|
|
||||||
private IReadOnlyList<IGlyphDescription> ReadGlyphs()
|
private IReadOnlyList<IGlyphDescription> ReadGlyphs()
|
||||||
{
|
{
|
||||||
|
if (tableBytes == null)
|
||||||
|
{
|
||||||
|
throw new InvalidOperationException("Bytes cache was discarded before lazy value evaluated.");
|
||||||
|
}
|
||||||
|
|
||||||
var data = tableBytes;
|
var data = tableBytes;
|
||||||
|
|
||||||
var offsets = glyphOffsets;
|
var offsets = glyphOffsets;
|
||||||
@ -149,6 +154,8 @@
|
|||||||
result[compositeLocation.Key] = ReadCompositeGlyph(data, compositeLocation.Value, compositeLocations, result, emptyGlyph);
|
result[compositeLocation.Key] = ReadCompositeGlyph(data, compositeLocation.Value, compositeLocations, result, emptyGlyph);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tableBytes = null;
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
namespace UglyToad.PdfPig.Tests.Integration
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
|
||||||
|
|
||||||
|
namespace UglyToad.PdfPig.Tests.Integration
|
||||||
{
|
{
|
||||||
using System;
|
using System;
|
||||||
using System.Collections.Generic;
|
using System.Collections.Generic;
|
||||||
@ -159,6 +161,19 @@ used per estimate, we introduce a “complement class”
Naive Bayes is often us
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Runs the content order text extractor over every page of the test document
/// and checks that each call yields a non-null string.
/// </summary>
[Fact]
public void CanExtractContentOrderText()
{
    using (var pdf = PdfDocument.Open(GetFilename()))
    {
        foreach (var currentPage in pdf.GetPages())
        {
            Assert.NotNull(ContentOrderTextExtractor.GetText(currentPage));
        }
    }
}
|
||||||
|
|
||||||
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
|
private static IReadOnlyList<AssertablePositionData> GetPdfBoxPositionData()
|
||||||
{
|
{
|
||||||
var path = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Integration", "Documents", "ICML03-081.Page1.Positions.txt");
|
var path = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Integration", "Documents", "ICML03-081.Page1.Positions.txt");
|
||||||
|
20
src/UglyToad.PdfPig/Util/WhitespaceSizeStatistics.cs
Normal file
20
src/UglyToad.PdfPig/Util/WhitespaceSizeStatistics.cs
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
namespace UglyToad.PdfPig.Util
|
||||||
|
{
|
||||||
|
using Content;
|
||||||
|
|
||||||
|
/// <summary>
/// Heuristics for the size of a whitespace gap, derived from a letter's point size.
/// </summary>
public static class WhitespaceSizeStatistics
{
    /// <summary>
    /// Fraction of a letter's point size used as the expected width of a whitespace.
    /// </summary>
    private const double ExpectedWhitespaceFactor = 0.27;

    /// <summary>
    /// Fraction of a letter's point size subtracted from the expected width to form the detection threshold.
    /// </summary>
    private const double ToleranceFactor = 0.05;

    /// <summary>
    /// Gets the whitespace size expected next to the given letter.
    /// </summary>
    /// <param name="letter">The letter whose point size is used as the scale.</param>
    /// <returns>The letter's point size multiplied by 0.27.</returns>
    public static double GetExpectedWhitespaceSize(Letter letter)
    {
        return letter.PointSize * ExpectedWhitespaceFactor;
    }

    /// <summary>
    /// Checks whether a measured gap is probably large enough to represent a whitespace character.
    /// </summary>
    /// <param name="gap">The measured gap.</param>
    /// <param name="letter">The letter whose point size is used as the scale.</param>
    /// <returns>
    /// <see langword="true"/> if the gap is greater than the expected whitespace size
    /// less 0.05 times the letter's point size.
    /// </returns>
    public static bool IsProbablyWhitespace(double gap, Letter letter)
    {
        var expected = GetExpectedWhitespaceSize(letter);
        var threshold = expected - (letter.PointSize * ToleranceFactor);

        return gap > threshold;
    }
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user