mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-04-05 20:55:01 +08:00
* #836 Fix UnsupervisedReadingOrder orders 2 blocks on the same row out of order Add images for documentation * Update Documentation: Additional example, Reference to wiki * Change code formating to C# on documentation * Fix link in documentation * Fix Spelling --------- Co-authored-by: David <David@david>
This commit is contained in:
parent
d86c2f44f0
commit
d7e434edaa
222
README.md
222
README.md
@ -12,6 +12,9 @@ This project aims to port [PDFBox](https://github.com/apache/pdfbox) to C#.
|
|||||||
|
|
||||||
**Migrating to 0.1.6 from 0.1.x?** Use this guide: [migration to 0.1.6](https://github.com/UglyToad/PdfPig/wiki/Migration-to-0.1.6).
|
**Migrating to 0.1.6 from 0.1.x?** Use this guide: [migration to 0.1.6](https://github.com/UglyToad/PdfPig/wiki/Migration-to-0.1.6).
|
||||||
|
|
||||||
|
## Wiki
|
||||||
|
Check out our [wiki](https://github.com/UglyToad/PdfPig/wiki) for more examples and detailed guides on the API.
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
The package is available via the releases tab or from Nuget:
|
The package is available via the releases tab or from Nuget:
|
||||||
@ -26,20 +29,25 @@ While the version is below 1.0.0 minor versions will change the public API witho
|
|||||||
|
|
||||||
## Get Started
|
## Get Started
|
||||||
|
|
||||||
|
See the [wiki](https://github.com/UglyToad/PdfPig/wiki) for more examples
|
||||||
|
|
||||||
|
### Read words in a page
|
||||||
The simplest usage at this stage is to open a document, reading the words from every page:
|
The simplest usage at this stage is to open a document, reading the words from every page:
|
||||||
|
|
||||||
using (PdfDocument document = PdfDocument.Open(@"C:\Documents\document.pdf"))
|
```cs
|
||||||
{
|
using (PdfDocument document = PdfDocument.Open(@"C:\Documents\document.pdf"))
|
||||||
foreach (Page page in document.GetPages())
|
{
|
||||||
{
|
foreach (Page page in document.GetPages())
|
||||||
string pageText = page.Text;
|
{
|
||||||
|
string pageText = page.Text;
|
||||||
|
|
||||||
foreach (Word word in page.GetWords())
|
foreach (Word word in page.GetWords())
|
||||||
{
|
{
|
||||||
Console.WriteLine(word.Text);
|
Console.WriteLine(word.Text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
```
|
||||||
|
|
||||||
An example of the output of this is shown below:
|
An example of the output of this is shown below:
|
||||||
|
|
||||||
@ -47,20 +55,23 @@ An example of the output of this is shown below:
|
|||||||
|
|
||||||
Where for the PDF text ("Write something in") shown at the top the 3 words (in pink) are detected and each word contains the individual letters with glyph bounding boxes.
|
Where for the PDF text ("Write something in") shown at the top the 3 words (in pink) are detected and each word contains the individual letters with glyph bounding boxes.
|
||||||
|
|
||||||
|
### Create PDF Document
|
||||||
To create documents use the class `PdfDocumentBuilder`. The Standard 14 fonts provide a quick way to get started:
|
To create documents use the class `PdfDocumentBuilder`. The Standard 14 fonts provide a quick way to get started:
|
||||||
|
|
||||||
PdfDocumentBuilder builder = new PdfDocumentBuilder();
|
```cs
|
||||||
|
PdfDocumentBuilder builder = new PdfDocumentBuilder();
|
||||||
|
|
||||||
PdfPageBuilder page = builder.AddPage(PageSize.A4);
|
PdfPageBuilder page = builder.AddPage(PageSize.A4);
|
||||||
|
|
||||||
// Fonts must be registered with the document builder prior to use to prevent duplication.
|
// Fonts must be registered with the document builder prior to use to prevent duplication.
|
||||||
PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica);
|
PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica);
|
||||||
|
|
||||||
page.AddText("Hello World!", 12, new PdfPoint(25, 700), font);
|
page.AddText("Hello World!", 12, new PdfPoint(25, 700), font);
|
||||||
|
|
||||||
byte[] documentBytes = builder.Build();
|
byte[] documentBytes = builder.Build();
|
||||||
|
|
||||||
File.WriteAllBytes(@"C:\git\newPdf.pdf", documentBytes);
|
File.WriteAllBytes(@"C:\git\newPdf.pdf", documentBytes);
|
||||||
|
```
|
||||||
|
|
||||||
The output is a 1 page PDF document with the text "Hello World!" in Helvetica near the top of the page:
|
The output is a 1 page PDF document with the text "Hello World!" in Helvetica near the top of the page:
|
||||||
|
|
||||||
@ -68,25 +79,90 @@ The output is a 1 page PDF document with the text "Hello World!" in Helvetica ne
|
|||||||
|
|
||||||
Each font must be registered with the PdfDocumentBuilder prior to use to enable pages to share the font resources. Only Standard 14 fonts and TrueType fonts (.ttf) are supported.
|
Each font must be registered with the PdfDocumentBuilder prior to use to enable pages to share the font resources. Only Standard 14 fonts and TrueType fonts (.ttf) are supported.
|
||||||
|
|
||||||
|
### Advanced Document Extraction
|
||||||
|
In this example a more advanced document extraction is performed. `PdfDocumentBuilder` is used to create a copy of the PDF with debug information (bounding boxes and reading order) added.
|
||||||
|
|
||||||
|
|
||||||
|
```cs
|
||||||
|
//using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;
|
||||||
|
//using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
|
||||||
|
//using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor;
|
||||||
|
//using UglyToad.PdfPig.Fonts.Standard14Fonts;
|
||||||
|
|
||||||
|
var sourcePdfPath = "";
|
||||||
|
var outputPath = "";
|
||||||
|
var pageNumber = 1;
|
||||||
|
using (var document = PdfDocument.Open(sourcePdfPath))
|
||||||
|
{
|
||||||
|
var builder = new PdfDocumentBuilder { };
|
||||||
|
PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica);
|
||||||
|
var pageBuilder = builder.AddPage(document, pageNumber);
|
||||||
|
pageBuilder.SetStrokeColor(0, 255, 0);
|
||||||
|
var page = document.GetPage(pageNumber);
|
||||||
|
foreach (var word in page.GetWords())
|
||||||
|
{
|
||||||
|
|
||||||
|
var letters = page.Letters; // no preprocessing
|
||||||
|
|
||||||
|
// 1. Extract words
|
||||||
|
var wordExtractor = NearestNeighbourWordExtractor.Instance;
|
||||||
|
|
||||||
|
var words = wordExtractor.GetWords(letters);
|
||||||
|
|
||||||
|
// 2. Segment page
|
||||||
|
var pageSegmenter = DocstrumBoundingBoxes.Instance;
|
||||||
|
|
||||||
|
var textBlocks = pageSegmenter.GetBlocks(words);
|
||||||
|
|
||||||
|
// 3. Postprocessing
|
||||||
|
var readingOrder = UnsupervisedReadingOrderDetector.Instance;
|
||||||
|
var orderedTextBlocks = readingOrder.Get(textBlocks);
|
||||||
|
|
||||||
|
// 4. Add debug info - Bounding boxes and reading order
|
||||||
|
foreach (var block in orderedTextBlocks)
|
||||||
|
{
|
||||||
|
var bbox = block.BoundingBox;
|
||||||
|
pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height);
|
||||||
|
pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox.TopLeft, font);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 5. Write result to a file
|
||||||
|
byte[] fileBytes = builder.Build();
|
||||||
|
File.WriteAllBytes(outputPath, fileBytes); // save to file
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
See [Document Layout Analysis](https://github.com/UglyToad/PdfPig/wiki/Document-Layout-Analysis) for more information on advanced document analysis.
|
||||||
|
|
||||||
|
See [Export](https://github.com/UglyToad/PdfPig/wiki/Document-Layout-Analysis#export) for more advanced tooling to analyse document layouts.
|
||||||
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
### PdfDocument
|
||||||
|
|
||||||
The `PdfDocument` class provides access to the contents of a document loaded either from file or passed in as bytes. To open from a file use the `PdfDocument.Open` static method:
|
The `PdfDocument` class provides access to the contents of a document loaded either from file or passed in as bytes. To open from a file use the `PdfDocument.Open` static method:
|
||||||
|
|
||||||
using UglyToad.PdfPig;
|
```cs
|
||||||
using UglyToad.PdfPig.Content;
|
using UglyToad.PdfPig;
|
||||||
|
using UglyToad.PdfPig.Content;
|
||||||
|
|
||||||
using (PdfDocument document = PdfDocument.Open(@"C:\my-file.pdf"))
|
using (PdfDocument document = PdfDocument.Open(@"C:\my-file.pdf"))
|
||||||
{
|
{
|
||||||
int pageCount = document.NumberOfPages;
|
int pageCount = document.NumberOfPages;
|
||||||
|
|
||||||
// Page number starts from 1, not 0.
|
// Page number starts from 1, not 0.
|
||||||
Page page = document.GetPage(1);
|
Page page = document.GetPage(1);
|
||||||
|
|
||||||
decimal widthInPoints = page.Width;
|
decimal widthInPoints = page.Width;
|
||||||
decimal heightInPoints = page.Height;
|
decimal heightInPoints = page.Height;
|
||||||
|
|
||||||
string text = page.Text;
|
string text = page.Text;
|
||||||
}
|
}
|
||||||
|
```
|
||||||
|
|
||||||
`PdfDocument` should only be used in a `using` statement since it implements `IDisposable` (unless the consumer disposes of it elsewhere).
|
`PdfDocument` should only be used in a `using` statement since it implements `IDisposable` (unless the consumer disposes of it elsewhere).
|
||||||
|
|
||||||
@ -96,10 +172,12 @@ Encrypted documents can be opened by PdfPig. To provide an owner or user passwor
|
|||||||
|
|
||||||
You can also provide a list of passwords to try:
|
You can also provide a list of passwords to try:
|
||||||
|
|
||||||
using (PdfDocument document = PdfDocument.Open(@"C:\file.pdf", new ParsingOptions
|
```cs
|
||||||
{
|
using (PdfDocument document = PdfDocument.Open(@"C:\file.pdf", new ParsingOptions
|
||||||
Passwords = new List<string> { "One", "Two" }
|
{
|
||||||
}))
|
Passwords = new List<string> { "One", "Two" }
|
||||||
|
}))
|
||||||
|
```
|
||||||
|
|
||||||
The document contains the version of the PDF specification it complies with, accessed by `document.Version`:
|
The document contains the version of the PDF specification it complies with, accessed by `document.Version`:
|
||||||
|
|
||||||
@ -133,8 +211,10 @@ This creates a new `PdfPageBuilder` with the specified size. The first added pag
|
|||||||
|
|
||||||
To draw lines and rectangles use the methods:
|
To draw lines and rectangles use the methods:
|
||||||
|
|
||||||
void DrawLine(PdfPoint from, PdfPoint to, decimal lineWidth = 1)
|
```cs
|
||||||
void DrawRectangle(PdfPoint position, decimal width, decimal height, decimal lineWidth = 1)
|
void DrawLine(PdfPoint from, PdfPoint to, decimal lineWidth = 1)
|
||||||
|
void DrawRectangle(PdfPoint position, decimal width, decimal height, decimal lineWidth = 1)
|
||||||
|
```
|
||||||
|
|
||||||
The line width can be varied and defaults to 1. Rectangles are unfilled and the fill color cannot be changed at present.
|
The line width can be varied and defaults to 1. Rectangles are unfilled and the fill color cannot be changed at present.
|
||||||
|
|
||||||
@ -150,8 +230,10 @@ Which does not change the state of the page, unlike `AddText`.
|
|||||||
|
|
||||||
Changing the RGB color of text, lines and rectangles is supported using:
|
Changing the RGB color of text, lines and rectangles is supported using:
|
||||||
|
|
||||||
void SetStrokeColor(byte r, byte g, byte b)
|
```cs
|
||||||
void SetTextAndFillColor(byte r, byte g, byte b)
|
void SetStrokeColor(byte r, byte g, byte b)
|
||||||
|
void SetTextAndFillColor(byte r, byte g, byte b)
|
||||||
|
```
|
||||||
|
|
||||||
Which take RGB values between 0 and 255. The color will remain active for all operations called after these methods until reset is called using:
|
Which take RGB values between 0 and 255. The color will remain active for all operations called after these methods until reset is called using:
|
||||||
|
|
||||||
@ -163,14 +245,16 @@ Which resets the color for stroke, fill and text drawing to black.
|
|||||||
|
|
||||||
The `PdfDocument` provides access to the document metadata as `DocumentInformation` defined in the PDF file. These tend not to be provided therefore most of these entries will be `null`:
|
The `PdfDocument` provides access to the document metadata as `DocumentInformation` defined in the PDF file. These tend not to be provided therefore most of these entries will be `null`:
|
||||||
|
|
||||||
PdfDocument document = PdfDocument.Open(fileName);
|
```cs
|
||||||
|
PdfDocument document = PdfDocument.Open(fileName);
|
||||||
|
|
||||||
// The name of the program used to convert this document to PDF.
|
// The name of the program used to convert this document to PDF.
|
||||||
string producer = document.Information.Producer;
|
string producer = document.Information.Producer;
|
||||||
|
|
||||||
// The title given to the document
|
// The title given to the document
|
||||||
string title = document.Information.Title;
|
string title = document.Information.Title;
|
||||||
// etc...
|
// etc...
|
||||||
|
```
|
||||||
|
|
||||||
### Document Structure (0.0.3)
|
### Document Structure (0.0.3)
|
||||||
|
|
||||||
@ -180,8 +264,10 @@ The document now has a Structure member:
|
|||||||
|
|
||||||
This provides access to tokenized PDF document content:
|
This provides access to tokenized PDF document content:
|
||||||
|
|
||||||
Catalog catalog = structure.Catalog;
|
```cs
|
||||||
DictionaryToken pagesDictionary = catalog.PagesDictionary;
|
Catalog catalog = structure.Catalog;
|
||||||
|
DictionaryToken pagesDictionary = catalog.PagesDictionary;
|
||||||
|
```
|
||||||
|
|
||||||
The pages dictionary is the root of the pages tree within a PDF document. The structure also exposes a `GetObject(IndirectReference reference)` method which allows random access to any object in the PDF as long as its identifier number is known. This is an identifier of the form `69 0 R` where 69 is the object number and 0 is the generation.
|
The pages dictionary is the root of the pages tree within a PDF document. The structure also exposes a `GetObject(IndirectReference reference)` method which allows random access to any object in the PDF as long as its identifier number is known. This is an identifier of the form `69 0 R` where 69 is the object number and 0 is the generation.
|
||||||
|
|
||||||
@ -189,9 +275,12 @@ The pages dictionary is the root of the pages tree within a PDF document. The st
|
|||||||
|
|
||||||
The `Page` contains the page width and height in points as well as mapping to the `PageSize` enum:
|
The `Page` contains the page width and height in points as well as mapping to the `PageSize` enum:
|
||||||
|
|
||||||
PageSize size = Page.Size;
|
|
||||||
|
|
||||||
bool isA4 = size == PageSize.A4;
|
```cs
|
||||||
|
PageSize size = Page.Size;
|
||||||
|
|
||||||
|
bool isA4 = size == PageSize.A4;
|
||||||
|
```
|
||||||
|
|
||||||
`Page` provides access to the text of the page:
|
`Page` provides access to the text of the page:
|
||||||
|
|
||||||
@ -259,6 +348,8 @@ This will return `false` if the document does not contain a form.
|
|||||||
|
|
||||||
The fields can be accessed using the `AcroForm`'s `Fields` property. Since the form is defined at the document level this will return fields from all pages in the document. Fields are of the types defined by the enum `AcroFieldType`, for example `PushButton`, `Checkbox`, `Text`, etc.
|
The fields can be accessed using the `AcroForm`'s `Fields` property. Since the form is defined at the document level this will return fields from all pages in the document. Fields are of the types defined by the enum `AcroFieldType`, for example `PushButton`, `Checkbox`, `Text`, etc.
|
||||||
|
|
||||||
|
Please note that forms are read-only: values cannot be changed or added using PdfPig.
|
||||||
|
|
||||||
### Hyperlinks (0.1.0)
|
### Hyperlinks (0.1.0)
|
||||||
|
|
||||||
A page has a method to extract hyperlinks (annotations of link type):
|
A page has a method to extract hyperlinks (annotations of link type):
|
||||||
@ -269,12 +360,15 @@ A page has a method to extract hyperlinks (annotations of link type):
|
|||||||
|
|
||||||
The classes used to work with TrueType fonts in the PDF file are now available for public consumption. Given an input file:
|
The classes used to work with TrueType fonts in the PDF file are now available for public consumption. Given an input file:
|
||||||
|
|
||||||
using UglyToad.PdfPig.Fonts.TrueType;
|
|
||||||
using UglyToad.PdfPig.Fonts.TrueType.Parser;
|
|
||||||
|
|
||||||
byte[] fontBytes = System.IO.File.ReadAllBytes(@"C:\font.ttf");
|
```cs
|
||||||
TrueTypeDataBytes input = new TrueTypeDataBytes(fontBytes);
|
using UglyToad.PdfPig.Fonts.TrueType;
|
||||||
TrueTypeFont font = TrueTypeFontParser.Parse(input);
|
using UglyToad.PdfPig.Fonts.TrueType.Parser;
|
||||||
|
|
||||||
|
byte[] fontBytes = System.IO.File.ReadAllBytes(@"C:\font.ttf");
|
||||||
|
TrueTypeDataBytes input = new TrueTypeDataBytes(fontBytes);
|
||||||
|
TrueTypeFont font = TrueTypeFontParser.Parse(input);
|
||||||
|
```
|
||||||
|
|
||||||
The parsed font can then be inspected.
|
The parsed font can then be inspected.
|
||||||
|
|
||||||
@ -282,25 +376,31 @@ The parsed font can then be inspected.
|
|||||||
|
|
||||||
PDF files may contain other files entirely embedded inside them for document annotations. The list of embedded files and their byte content may be accessed:
|
PDF files may contain other files entirely embedded inside them for document annotations. The list of embedded files and their byte content may be accessed:
|
||||||
|
|
||||||
if (document.Advanced.TryGetEmbeddedFiles(out IReadOnlyList<EmbeddedFile> files)
|
```cs
|
||||||
&& files.Count > 0)
|
if (document.Advanced.TryGetEmbeddedFiles(out IReadOnlyList<EmbeddedFile> files)
|
||||||
{
|
&& files.Count > 0)
|
||||||
var firstFile = files[0];
|
{
|
||||||
string name = firstFile.Name;
|
var firstFile = files[0];
|
||||||
IReadOnlyList<byte> bytes = firstFile.Bytes;
|
string name = firstFile.Name;
|
||||||
}
|
IReadOnlyList<byte> bytes = firstFile.Bytes;
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
### Merging (0.1.2)
|
### Merging (0.1.2)
|
||||||
|
|
||||||
You can merge 2 or more existing PDF files using the `PdfMerger` class:
|
You can merge 2 or more existing PDF files using the `PdfMerger` class:
|
||||||
|
|
||||||
var resultFileBytes = PdfMerger.Merge(filePath1, filePath2);
|
```cs
|
||||||
File.WriteAllBytes(@"C:\pdfs\outputfilename.pdf", resultFileBytes);
|
var resultFileBytes = PdfMerger.Merge(filePath1, filePath2);
|
||||||
|
File.WriteAllBytes(@"C:\pdfs\outputfilename.pdf", resultFileBytes);
|
||||||
|
```
|
||||||
|
|
||||||
## API Reference
|
## API Reference
|
||||||
|
|
||||||
If you wish to generate doxygen documentation, run `doxygen doxygen-docs` and open `docs/doxygen/html/index.html`.
|
If you wish to generate doxygen documentation, run `doxygen doxygen-docs` and open `docs/doxygen/html/index.html`.
|
||||||
|
|
||||||
|
See also the [wiki](https://github.com/UglyToad/PdfPig/wiki) for detailed documentation on parts of the API.
|
||||||
|
|
||||||
## Issues
|
## Issues
|
||||||
|
|
||||||
Please do file an issue if you encounter a bug.
|
Please do file an issue if you encounter a bug.
|
||||||
|
BIN
documentation/DrawingBoundingBoxes.jpg
Normal file
BIN
documentation/DrawingBoundingBoxes.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 191 KiB |
BIN
documentation/advancedBlockEditorExample.jpg
Normal file
BIN
documentation/advancedBlockEditorExample.jpg
Normal file
Binary file not shown.
After Width: | Height: | Size: 154 KiB |
BIN
documentation/boundingBoxes_ReadingOrder.png
Normal file
BIN
documentation/boundingBoxes_ReadingOrder.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 130 KiB |
@ -0,0 +1,279 @@
|
|||||||
|
namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector
|
||||||
|
{
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Text;
|
||||||
|
using UglyToad.PdfPig.Core;
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Gets the Thick Boundary Rectangle Relations (TBRR)
|
||||||
|
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
|
||||||
|
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
|
||||||
|
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
|
||||||
|
/// <para>See also https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para>
|
||||||
|
/// </summary>
|
||||||
|
public static class IntervalRelationsHelper
|
||||||
|
{
|
||||||
|
|
||||||
|
/// <summary>
/// Classifies the X-axis relation between two rectangles using the Thick Boundary Rectangle Relations (TBRR).
/// <para>The interval of each rectangle runs from its <c>Left</c> edge to its <c>Right</c> edge. Two coordinates
/// that differ by no more than <paramref name="T"/> are treated as equal.</para>
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial
/// relations of the document objects on the page. For every pair of document objects a and b, one X and one Y
/// interval relation hold. If one considers the pair in reversed order, the inverse interval relation holds.
/// Therefore the directed graph g_i representing these relations is complete.</para>
/// </summary>
/// <param name="a">The rectangle whose horizontal interval is being classified.</param>
/// <param name="b">The rectangle whose horizontal interval <paramref name="a"/> is compared against.</param>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
/// <returns>
/// The first relation whose test succeeds, tested in this order: Equals, Meets, MeetsI, Starts, StartsI,
/// Finishes, FinishesI, During, DuringI, Overlaps, OverlapsI, Precedes, PrecedesI.
/// <see cref="IntervalRelations.Unknown"/> is returned when no test succeeds.
/// </returns>
public static IntervalRelations GetRelationX(PdfRectangle a, PdfRectangle b, double T)
{
    // Edge comparisons that are reused by several of the tests below.
    bool aLeftNearBLeft = b.Left - T <= a.Left && a.Left <= b.Left + T;
    bool aRightNearBRight = b.Right - T <= a.Right && a.Right <= b.Right + T;
    bool aLeftPastBLeft = a.Left > b.Left + T;
    bool bLeftPastALeft = b.Left > a.Left + T;
    bool aRightShortOfBRight = a.Right < b.Right - T;
    bool bRightShortOfARight = b.Right < a.Right - T;

    // The sequence of the tests is significant: the first match decides the result.

    // Both edges coincide.
    if (aLeftNearBLeft && aRightNearBRight)
    {
        return IntervalRelations.Equals;
    }

    // The right edge of a touches the left edge of b.
    if (b.Left - T <= a.Right && a.Right <= b.Left + T)
    {
        return IntervalRelations.Meets;
    }

    // The right edge of b touches the left edge of a.
    if (a.Left - T <= b.Right && b.Right <= a.Left + T)
    {
        return IntervalRelations.MeetsI;
    }

    // Shared left edge, a ends before b does.
    if (aLeftNearBLeft && aRightShortOfBRight)
    {
        return IntervalRelations.Starts;
    }

    // Shared left edge, b ends before a does.
    if (a.Left - T <= b.Left && b.Left <= a.Left + T && bRightShortOfARight)
    {
        return IntervalRelations.StartsI;
    }

    // Shared right edge, a begins after b does.
    if (aLeftPastBLeft && aRightNearBRight)
    {
        return IntervalRelations.Finishes;
    }

    // Shared right edge, b begins after a does.
    if (bLeftPastALeft && (a.Right - T <= b.Right && b.Right <= a.Right + T))
    {
        return IntervalRelations.FinishesI;
    }

    // a lies strictly inside b.
    if (aLeftPastBLeft && aRightShortOfBRight)
    {
        return IntervalRelations.During;
    }

    // b lies strictly inside a.
    if (bLeftPastALeft && bRightShortOfARight)
    {
        return IntervalRelations.DuringI;
    }

    // a begins before b and ends inside b.
    if (a.Left < b.Left - T && (b.Left + T < a.Right && aRightShortOfBRight))
    {
        return IntervalRelations.Overlaps;
    }

    // b begins before a and ends inside a.
    if (b.Left < a.Left - T && (a.Left + T < b.Right && bRightShortOfARight))
    {
        return IntervalRelations.OverlapsI;
    }

    // a ends before b begins.
    if (a.Right < b.Left - T)
    {
        return IntervalRelations.Precedes;
    }

    // b ends before a begins.
    if (b.Right < a.Left - T)
    {
        return IntervalRelations.PrecedesI;
    }

    return IntervalRelations.Unknown;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Classifies the Y-axis relation between two rectangles using the Thick Boundary Rectangle Relations (TBRR).
/// <para>The comparisons are made on the <c>Top</c> and <c>Bottom</c> edges of the rectangles. Two coordinates
/// that differ by no more than <paramref name="T"/> are treated as equal.</para>
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial
/// relations of the document objects on the page. For every pair of document objects a and b, one X and one Y
/// interval relation hold. If one considers the pair in reversed order, the inverse interval relation holds.
/// Therefore the directed graph g_i representing these relations is complete.</para>
/// </summary>
/// <param name="a">The rectangle whose vertical interval is being classified.</param>
/// <param name="b">The rectangle whose vertical interval <paramref name="a"/> is compared against.</param>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
/// <returns>
/// The first relation whose test succeeds, tested in this order: Equals, MeetsI, Meets, StartsI, Starts,
/// FinishesI, Finishes, DuringI, During, OverlapsI, Overlaps, PrecedesI, Precedes.
/// <see cref="IntervalRelations.Unknown"/> is returned when no test succeeds.
/// </returns>
public static IntervalRelations GetRelationY(PdfRectangle a, PdfRectangle b, double T)
{
    // Edge comparisons that are reused by several of the tests below.
    bool aTopNearBTop = b.Top - T <= a.Top && a.Top <= b.Top + T;
    bool aBottomNearBBottom = b.Bottom - T <= a.Bottom && a.Bottom <= b.Bottom + T;
    bool aTopAboveBTop = a.Top > b.Top + T;
    bool bTopAboveATop = b.Top > a.Top + T;
    bool aBottomBelowBBottom = a.Bottom < b.Bottom - T;
    bool bBottomBelowABottom = b.Bottom < a.Bottom - T;

    // The sequence of the tests is significant: the first match decides the result.

    // Both edges coincide.
    if (aTopNearBTop && aBottomNearBBottom)
    {
        return IntervalRelations.Equals;
    }

    // The bottom edge of b touches the top edge of a.
    if (a.Top - T <= b.Bottom && b.Bottom <= a.Top + T)
    {
        return IntervalRelations.MeetsI;
    }

    // The bottom edge of a touches the top edge of b.
    if (b.Top - T <= a.Bottom && a.Bottom <= b.Top + T)
    {
        return IntervalRelations.Meets;
    }

    // Shared top edge, a extends further down than b.
    if (aTopNearBTop && aBottomBelowBBottom)
    {
        return IntervalRelations.StartsI;
    }

    // Shared top edge, b extends further down than a.
    if (a.Top - T <= b.Top && b.Top <= a.Top + T && bBottomBelowABottom)
    {
        return IntervalRelations.Starts;
    }

    // Shared bottom edge, a reaches higher than b.
    if (aTopAboveBTop && aBottomNearBBottom)
    {
        return IntervalRelations.FinishesI;
    }

    // Shared bottom edge, b reaches higher than a.
    if (bTopAboveATop && (a.Bottom - T <= b.Bottom && b.Bottom <= a.Bottom + T))
    {
        return IntervalRelations.Finishes;
    }

    // a extends beyond b on both the top and the bottom.
    if (aTopAboveBTop && aBottomBelowBBottom)
    {
        return IntervalRelations.DuringI;
    }

    // b extends beyond a on both the top and the bottom.
    if (bTopAboveATop && bBottomBelowABottom)
    {
        return IntervalRelations.During;
    }

    // The top of a falls between the edges of b and a extends below b.
    if (a.Top < b.Top - T && (b.Bottom + T < a.Top && aBottomBelowBBottom))
    {
        return IntervalRelations.OverlapsI;
    }

    // The top of b falls between the edges of a and b extends below a.
    if (b.Top < a.Top - T && (a.Bottom + T < b.Top && bBottomBelowABottom))
    {
        return IntervalRelations.Overlaps;
    }

    // Reached only when every test above failed: the bottom of a is more than T below the top of b.
    if (a.Bottom < b.Top - T)
    {
        return IntervalRelations.PrecedesI;
    }

    // Reached only when every test above failed: the bottom of b is more than T below the top of a.
    if (b.Bottom < a.Top - T)
    {
        return IntervalRelations.Precedes;
    }

    return IntervalRelations.Unknown;
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// The thirteen relations of Allen's interval algebra, plus <see cref="Unknown"/>.
/// <para>See https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para>
/// </summary>
public enum IntervalRelations
{
    /// <summary>
    /// The relation between the two intervals is not known.
    /// </summary>
    Unknown = 0,

    /// <summary>
    /// X precedes Y: X takes place before Y.
    /// <para>|____X____|----------------------</para>
    /// <para>----------------------|____Y____|</para>
    /// </summary>
    Precedes = 1,

    /// <summary>
    /// X meets Y.
    /// <para>|_____X______|--------------</para>
    /// <para>--------------|______Y_____|</para>
    /// </summary>
    Meets = 2,

    /// <summary>
    /// X overlaps with Y.
    /// <para>|________X________|-------------</para>
    /// <para>-------------|________Y________|</para>
    /// </summary>
    Overlaps = 3,

    /// <summary>
    /// X starts Y.
    /// <para>|____X____|-----------------</para>
    /// <para>|_______Y_______|-----------</para>
    /// </summary>
    Starts = 4,

    /// <summary>
    /// X during Y.
    /// <para>--------|____X____|---------</para>
    /// <para>-----|_______Y________|-----</para>
    /// </summary>
    During = 5,

    /// <summary>
    /// X finishes Y.
    /// <para>-----------------|____X____|</para>
    /// <para>-----------|_______Y_______|</para>
    /// </summary>
    Finishes = 6,

    /// <summary>
    /// Inverse of <see cref="Precedes"/>.
    /// </summary>
    PrecedesI = 7,

    /// <summary>
    /// Inverse of <see cref="Meets"/>.
    /// </summary>
    MeetsI = 8,

    /// <summary>
    /// Inverse of <see cref="Overlaps"/>.
    /// </summary>
    OverlapsI = 9,

    /// <summary>
    /// Inverse of <see cref="Starts"/>.
    /// </summary>
    StartsI = 10,

    /// <summary>
    /// Inverse of <see cref="During"/>.
    /// </summary>
    DuringI = 11,

    /// <summary>
    /// Inverse of <see cref="Finishes"/>.
    /// </summary>
    FinishesI = 12,

    /// <summary>
    /// X is equal to Y.
    /// <para>----------|____X____|------------</para>
    /// <para>----------|____Y____|------------</para>
    /// </summary>
    Equals = 13
}
|
||||||
|
}
|
@ -188,8 +188,8 @@
|
|||||||
/// <param name="T">The tolerance parameter T.</param>
|
/// <param name="T">The tolerance parameter T.</param>
|
||||||
private static bool GetBeforeInReading(TextBlock a, TextBlock b, double T)
|
private static bool GetBeforeInReading(TextBlock a, TextBlock b, double T)
|
||||||
{
|
{
|
||||||
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
|
IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T);
|
||||||
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
|
IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T);
|
||||||
|
|
||||||
return xRelation == IntervalRelations.Precedes ||
|
return xRelation == IntervalRelations.Precedes ||
|
||||||
yRelation == IntervalRelations.Precedes ||
|
yRelation == IntervalRelations.Precedes ||
|
||||||
@ -207,8 +207,8 @@
|
|||||||
/// <param name="T">The tolerance parameter T.</param>
|
/// <param name="T">The tolerance parameter T.</param>
|
||||||
private static bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T)
|
private static bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T)
|
||||||
{
|
{
|
||||||
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
|
IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T);
|
||||||
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
|
IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T);
|
||||||
|
|
||||||
return xRelation == IntervalRelations.Precedes ||
|
return xRelation == IntervalRelations.Precedes ||
|
||||||
xRelation == IntervalRelations.Meets ||
|
xRelation == IntervalRelations.Meets ||
|
||||||
@ -237,8 +237,8 @@
|
|||||||
/// <param name="T">The tolerance parameter T.</param>
|
/// <param name="T">The tolerance parameter T.</param>
|
||||||
private static bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
|
private static bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T)
|
||||||
{
|
{
|
||||||
IntervalRelations xRelation = GetIntervalRelationX(a, b, T);
|
IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T);
|
||||||
IntervalRelations yRelation = GetIntervalRelationY(a, b, T);
|
IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T);
|
||||||
|
|
||||||
return yRelation == IntervalRelations.Precedes ||
|
return yRelation == IntervalRelations.Precedes ||
|
||||||
yRelation == IntervalRelations.Meets ||
|
yRelation == IntervalRelations.Meets ||
|
||||||
@ -259,263 +259,5 @@
|
|||||||
yRelation == IntervalRelations.OverlapsI));
|
yRelation == IntervalRelations.OverlapsI));
|
||||||
}
|
}
|
||||||
|
|
||||||
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// <para>Each block is reduced to the interval [Left, Right] of its bounding box and the two intervals are
/// classified with Allen's interval relations, treating coordinates closer than <paramref name="T"/> as equal.</para>
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
/// <returns>The relation of <paramref name="a"/> to <paramref name="b"/> along X,
/// or <see cref="IntervalRelations.Unknown"/> when no rule matches (e.g. NaN coordinates).</returns>
private static IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T)
{
    double aStart = a.BoundingBox.Left;
    double aEnd = a.BoundingBox.Right;
    double bStart = b.BoundingBox.Left;
    double bEnd = b.BoundingBox.Right;

    // "Aligned" = equal within the tolerance T. Both tests are symmetric in a and b.
    bool startsAligned = bStart - T <= aStart && aStart <= bStart + T;
    bool endsAligned = bEnd - T <= aEnd && aEnd <= bEnd + T;

    // The order of the checks matters: the tolerance makes some relations overlap for very
    // small intervals, so the most specific relation (Equals) is tested first and the
    // loosest one (Precedes) last. Every inverse relation is the direct relation with
    // a and b swapped.
    if (startsAligned && endsAligned)
    {
        return IntervalRelations.Equals;
    }

    if (bStart - T <= aEnd && aEnd <= bStart + T)
    {
        return IntervalRelations.Meets;
    }

    if (aStart - T <= bEnd && bEnd <= aStart + T)
    {
        return IntervalRelations.MeetsI;
    }

    if (startsAligned)
    {
        if (aEnd < bEnd - T)
        {
            return IntervalRelations.Starts;
        }

        if (bEnd < aEnd - T)
        {
            return IntervalRelations.StartsI;
        }
    }

    if (endsAligned)
    {
        if (aStart > bStart + T)
        {
            return IntervalRelations.Finishes;
        }

        if (bStart > aStart + T)
        {
            return IntervalRelations.FinishesI;
        }
    }

    if (aStart > bStart + T && aEnd < bEnd - T)
    {
        return IntervalRelations.During;
    }

    if (bStart > aStart + T && bEnd < aEnd - T)
    {
        return IntervalRelations.DuringI;
    }

    if (aStart < bStart - T && bStart + T < aEnd && aEnd < bEnd - T)
    {
        return IntervalRelations.Overlaps;
    }

    if (bStart < aStart - T && aStart + T < bEnd && bEnd < aEnd - T)
    {
        return IntervalRelations.OverlapsI;
    }

    if (aEnd < bStart - T)
    {
        return IntervalRelations.Precedes;
    }

    if (bEnd < aStart - T)
    {
        return IntervalRelations.PrecedesI;
    }

    return IntervalRelations.Unknown;
}
|
|
||||||
|
|
||||||
/// <summary>
/// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate.
/// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page.
/// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed
/// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para>
/// <para>Each block is reduced to the interval running from the Top to the Bottom of its bounding box, i.e. in
/// top-to-bottom reading direction: a block lying entirely above another one precedes it.</para>
/// </summary>
/// <param name="a">The first block.</param>
/// <param name="b">The second block.</param>
/// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param>
/// <returns>The relation of <paramref name="a"/> to <paramref name="b"/> along Y,
/// or <see cref="IntervalRelations.Unknown"/> when no rule matches (e.g. NaN coordinates).</returns>
private static IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T)
{
    // Coordinates are negated so that "earlier in reading order" (higher on the page) maps
    // to a smaller value; the classification below can then use the usual start <= end
    // interval rules. Assumes Top is numerically greater than Bottom (origin at the
    // bottom-left of the page) - TODO confirm for rotated/flipped pages.
    double aStart = -a.BoundingBox.Top;
    double aEnd = -a.BoundingBox.Bottom;
    double bStart = -b.BoundingBox.Top;
    double bEnd = -b.BoundingBox.Bottom;

    // "Aligned" = equal within the tolerance T. Both tests are symmetric in a and b.
    bool startsAligned = bStart - T <= aStart && aStart <= bStart + T;
    bool endsAligned = bEnd - T <= aEnd && aEnd <= bEnd + T;

    // The order of the checks matters: the tolerance makes some relations overlap for very
    // small intervals, so the most specific relation (Equals) is tested first and the
    // loosest one (Precedes) last. Every inverse relation is the direct relation with
    // a and b swapped.
    if (startsAligned && endsAligned)
    {
        return IntervalRelations.Equals;
    }

    if (bStart - T <= aEnd && aEnd <= bStart + T)
    {
        return IntervalRelations.Meets;
    }

    if (aStart - T <= bEnd && bEnd <= aStart + T)
    {
        return IntervalRelations.MeetsI;
    }

    if (startsAligned)
    {
        if (aEnd < bEnd - T)
        {
            return IntervalRelations.Starts;
        }

        if (bEnd < aEnd - T)
        {
            return IntervalRelations.StartsI;
        }
    }

    if (endsAligned)
    {
        if (aStart > bStart + T)
        {
            return IntervalRelations.Finishes;
        }

        if (bStart > aStart + T)
        {
            return IntervalRelations.FinishesI;
        }
    }

    if (aStart > bStart + T && aEnd < bEnd - T)
    {
        return IntervalRelations.During;
    }

    if (bStart > aStart + T && bEnd < aEnd - T)
    {
        return IntervalRelations.DuringI;
    }

    if (aStart < bStart - T && bStart + T < aEnd && aEnd < bEnd - T)
    {
        return IntervalRelations.Overlaps;
    }

    if (bStart < aStart - T && aStart + T < bEnd && bEnd < aEnd - T)
    {
        return IntervalRelations.OverlapsI;
    }

    if (aEnd < bStart - T)
    {
        return IntervalRelations.Precedes;
    }

    if (bEnd < aStart - T)
    {
        return IntervalRelations.PrecedesI;
    }

    return IntervalRelations.Unknown;
}
|
|
||||||
|
|
||||||
/// <summary>
/// The thirteen relations of Allen's interval algebra, plus an <see cref="Unknown"/> fallback.
/// <para>See https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para>
/// <para>In the diagrams below, A is the first interval and B the second one.
/// Members suffixed with "I" are the inverse relation, i.e. the same picture with A and B swapped.</para>
/// </summary>
private enum IntervalRelations
{
    /// <summary>
    /// No relation could be determined.
    /// </summary>
    Unknown = 0,

    /// <summary>
    /// A ends before B begins, leaving a gap.
    /// <para>|____A____|......................</para>
    /// <para>......................|____B____|</para>
    /// </summary>
    Precedes = 1,

    /// <summary>
    /// A ends where B begins.
    /// <para>|____A____|.................</para>
    /// <para>.................|____B____|</para>
    /// </summary>
    Meets = 2,

    /// <summary>
    /// A begins before B and ends inside B.
    /// <para>|______A______|.................</para>
    /// <para>.................|______B______|</para>
    /// </summary>
    Overlaps = 3,

    /// <summary>
    /// A and B begin together and A ends first.
    /// <para>|____A____|.................</para>
    /// <para>|_____B_____|..............</para>
    /// </summary>
    Starts = 4,

    /// <summary>
    /// A lies strictly inside B.
    /// <para>........|____A____|.........</para>
    /// <para>.....|______B______|.....</para>
    /// </summary>
    During = 5,

    /// <summary>
    /// A and B end together and A begins later.
    /// <para>.................|____A____|</para>
    /// <para>..............|_____B_____|</para>
    /// </summary>
    Finishes = 6,

    /// <summary>
    /// Inverse of <see cref="Precedes"/>: B precedes A.
    /// </summary>
    PrecedesI = 7,

    /// <summary>
    /// Inverse of <see cref="Meets"/>: B meets A.
    /// </summary>
    MeetsI = 8,

    /// <summary>
    /// Inverse of <see cref="Overlaps"/>: B overlaps A.
    /// </summary>
    OverlapsI = 9,

    /// <summary>
    /// Inverse of <see cref="Starts"/>: B starts A.
    /// </summary>
    StartsI = 10,

    /// <summary>
    /// Inverse of <see cref="During"/>: B is during A.
    /// </summary>
    DuringI = 11,

    /// <summary>
    /// Inverse of <see cref="Finishes"/>: B finishes A.
    /// </summary>
    FinishesI = 12,

    /// <summary>
    /// A and B begin and end together.
    /// <para>..........|____A____|............</para>
    /// <para>..........|____B____|............</para>
    /// </summary>
    Equals = 13
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
319
src/UglyToad.PdfPig.Tests/Dla/IntervalRelationsHelperTests.cs
Normal file
319
src/UglyToad.PdfPig.Tests/Dla/IntervalRelationsHelperTests.cs
Normal file
@ -0,0 +1,319 @@
|
|||||||
|
namespace UglyToad.PdfPig.Tests.Dla
|
||||||
|
{
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using UglyToad.PdfPig.Content;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
|
||||||
|
using UglyToad.PdfPig.Core;
|
||||||
|
|
||||||
|
/// <summary>
/// Tests for <c>IntervalRelationsHelper.GetRelationX</c> and <c>IntervalRelationsHelper.GetRelationY</c>.
/// Each test checks a relation together with its inverse (same rectangles, arguments swapped).
/// <para>Note: (0,0) is the bottom left of the page. Rectangles are built with the
/// <c>PdfPointTestExtensions</c> helpers declared alongside this class.</para>
/// </summary>
public class IntervalRelationsHelperTests
{
    /// <summary>
    /// A is equal to B.
    /// <para>----------|____A____|------------</para>
    /// <para>----------|____B____|------------</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Equals_X()
    {
        var a = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(10, 10));

        var res = IntervalRelationsHelper.GetRelationX(a, a, 5);

        Assert.Equal(IntervalRelations.Equals, res);
    }

    /// <summary>
    /// Same as <see cref="IntervalRelation_Equals_X"/>, along the Y axis.
    /// </summary>
    [Fact]
    public void IntervalRelation_Equals_Y()
    {
        var a = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(10, 10));

        var res = IntervalRelationsHelper.GetRelationY(a, a, 5);

        Assert.Equal(IntervalRelations.Equals, res);
    }

    /// <summary>
    /// Precedes: A takes place before B.
    /// <para>|____A____|----------------------</para>
    /// <para>----------------------|____B____|</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Precedes_X()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft();
        var b = PdfPointTestExtensions.BoxAtTopLeft().MoveLeft(100);

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.Precedes, res);
        Assert.Equal(IntervalRelations.PrecedesI, resInverse);
    }

    /// <summary>
    /// Precedes along Y: B is A moved down by far more than A's height.
    /// </summary>
    [Fact]
    public void IntervalRelation_Precedes_Y()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft();
        var b = a.MoveDown(200);

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.Precedes, res);
        Assert.Equal(IntervalRelations.PrecedesI, resInverse);
    }

    /// <summary>
    /// A meets B.
    /// <para>|_____A______|--------------</para>
    /// <para>--------------|______B_____|</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Meets_X()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(100);
        var b = a.MoveLeft(100);

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.Meets, res);
        Assert.Equal(IntervalRelations.MeetsI, resInverse);
    }

    /// <summary>
    /// A meets B even though there is a gap of 10 between them, because the gap is below the tolerance of 11.
    /// <para>|_____A______|--------------</para>
    /// <para>--------------|______B_____|</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Meets_X_WithinTolerance()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(100);
        var b = a.MoveLeft(110);

        var res = IntervalRelationsHelper.GetRelationX(a, b, 11);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 11);

        Assert.Equal(IntervalRelations.Meets, res);
        Assert.Equal(IntervalRelations.MeetsI, resInverse);
    }

    /// <summary>
    /// Meets along Y: B is A moved down by exactly A's height.
    /// </summary>
    [Fact]
    public void IntervalRelation_Meets_Y()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(100);
        var b = a.MoveDown(100);

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.Meets, res);
        Assert.Equal(IntervalRelations.MeetsI, resInverse);
    }

    /// <summary>
    /// A box that meets A along Y stops meeting it, and is preceded by it, once moved further down.
    /// </summary>
    [Fact]
    public void IntervalRelation_Meets_Y_WhenMovedDown_BecomesPreceeds()
    {
        // We take an A and B that meet, then move B further down so that A precedes it.
        var startPoint = new PdfPoint(100, 600);
        var a = new PdfRectangle(startPoint, startPoint.MoveDown(100));
        var meetsABox = a.MoveDown(100);

        var res = IntervalRelationsHelper.GetRelationY(a, meetsABox, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(meetsABox, a, 5);

        Assert.Equal(IntervalRelations.Meets, res);
        Assert.Equal(IntervalRelations.MeetsI, resInverse);

        var preceededByABox = meetsABox.MoveDown(100);

        var moveRes = IntervalRelationsHelper.GetRelationY(a, preceededByABox, 5);
        var moveResInverse = IntervalRelationsHelper.GetRelationY(preceededByABox, a, 5);

        Assert.Equal(IntervalRelations.Precedes, moveRes);
        Assert.Equal(IntervalRelations.PrecedesI, moveResInverse);
    }

    /// <summary>
    /// A overlaps with B.
    /// <para>|________A________|-------------</para>
    /// <para>-------------|________B________|</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Overlaps_X()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(100);
        var b = a.MoveLeft(a.Width / 2);

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.Overlaps, res);
        Assert.Equal(IntervalRelations.OverlapsI, resInverse);
    }

    /// <summary>
    /// Overlaps along Y: B is A moved down by half of A's height.
    /// </summary>
    [Fact]
    public void IntervalRelation_Overlaps_Y()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(100);
        var b = a.MoveLeft(500).MoveDown(a.Height / 2); // Only the move down is important

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.Overlaps, res);
        Assert.Equal(IntervalRelations.OverlapsI, resInverse);
    }

    /// <summary>
    /// A starts B.
    /// <para>|____A____|-----------------</para>
    /// <para>|_______B_______|-----------</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Starts_X()
    {
        var topLeft = PdfPointTestExtensions.OriginTopLeft();
        var a = new PdfRectangle(topLeft, topLeft.MoveLeft(50).MoveDown(10));
        var b = new PdfRectangle(topLeft, topLeft.MoveLeft(100).MoveDown(10));

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.Starts, res);
        Assert.Equal(IntervalRelations.StartsI, resInverse);
    }

    /// <summary>
    /// Starts along Y: A and B share the same top edge and B extends twice as far down.
    /// </summary>
    [Fact]
    public void IntervalRelation_Starts_Y()
    {
        var topLeft = PdfPointTestExtensions.OriginTopLeft();
        var a = new PdfRectangle(topLeft, topLeft.MoveLeft(100).MoveDown(100));
        var b = new PdfRectangle(topLeft, topLeft.MoveLeft(100).MoveDown(200));

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.Starts, res);
        Assert.Equal(IntervalRelations.StartsI, resInverse);
    }

    /// <summary>
    /// A during B.
    /// <para>--------|____A____|---------</para>
    /// <para>-----|_______B________|-----</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_During_X()
    {
        var a = new PdfRectangle(new PdfPoint(20, 0), new PdfPoint(80, 0));
        var b = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(100, 0));

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.During, res);
        Assert.Equal(IntervalRelations.DuringI, resInverse);
    }

    /// <summary>
    /// During along Y: A spans 20..80 and B spans 0..100.
    /// </summary>
    [Fact]
    public void IntervalRelation_During_Y()
    {
        var a = new PdfRectangle(new PdfPoint(0, 20), new PdfPoint(0, 80));
        var b = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(0, 100));

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.During, res);
        Assert.Equal(IntervalRelations.DuringI, resInverse);
    }

    /// <summary>
    /// A finishes B.
    /// <para>-----------------|____A____|</para>
    /// <para>-----------|_______B_______|</para>
    /// </summary>
    [Fact]
    public void IntervalRelation_Finishes_X()
    {
        var topRight = PdfPointTestExtensions.OriginTopLeft().MoveLeft(400);
        var a = new PdfRectangle(topRight.MoveX(-100), topRight);
        var b = new PdfRectangle(topRight.MoveX(-200), topRight);

        var res = IntervalRelationsHelper.GetRelationX(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationX(b, a, 5);

        Assert.Equal(IntervalRelations.Finishes, res);
        Assert.Equal(IntervalRelations.FinishesI, resInverse);
    }

    /// <summary>
    /// Finishes along Y: A (side 20, moved down 20) and B (side 40) end on the same bottom edge.
    /// </summary>
    [Fact]
    public void IntervalRelation_Finishes_Y()
    {
        var a = PdfPointTestExtensions.BoxAtTopLeft(20).MoveDown(20);
        var b = PdfPointTestExtensions.BoxAtTopLeft(40);

        var res = IntervalRelationsHelper.GetRelationY(a, b, 5);
        var resInverse = IntervalRelationsHelper.GetRelationY(b, a, 5);

        Assert.Equal(IntervalRelations.Finishes, res);
        Assert.Equal(IntervalRelations.FinishesI, resInverse);
    }
}
|
||||||
|
|
||||||
|
/// <summary>
/// Helpers for building and shifting <see cref="PdfPoint"/> and <see cref="PdfRectangle"/> values in tests.
/// All distances must be zero or greater.
/// </summary>
internal static class PdfPointTestExtensions
{
    /// <summary>
    /// Returns the point (0, 800), used by the tests as the top left corner of a page.
    /// </summary>
    internal static PdfPoint OriginTopLeft()
    {
        return new PdfPoint(0, 800);
    }

    /// <summary>
    /// Shifts the point along X by <paramref name="dist"/>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): despite the name, the offset handed to <c>MoveX</c> is positive; the tests that use this
    /// helper expect the moved shape to come after the original along X, so it appears to move towards larger X.
    /// Confirm and consider renaming.
    /// </remarks>
    /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception>
    internal static PdfPoint MoveLeft(this PdfPoint it, double dist)
    {
        ThrowIfNegative(dist);

        return it.MoveX(dist);
    }

    /// <summary>
    /// Shifts the point along Y by minus <paramref name="dist"/>.
    /// </summary>
    /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception>
    internal static PdfPoint MoveDown(this PdfPoint it, double dist)
    {
        ThrowIfNegative(dist);

        return it.MoveY(-dist);
    }

    /// <summary>
    /// Builds a rectangle from <see cref="OriginTopLeft"/> and that same point shifted by
    /// <paramref name="length"/> with both <c>MoveLeft</c> and <c>MoveDown</c>.
    /// </summary>
    /// <param name="length">The offset applied on both axes; defaults to 10.</param>
    internal static PdfRectangle BoxAtTopLeft(double length = 10d)
    {
        return new PdfRectangle(OriginTopLeft(), OriginTopLeft().MoveLeft(length).MoveDown(length));
    }

    /// <summary>
    /// Returns a rectangle built from the BottomLeft and TopRight corners of <paramref name="start"/>,
    /// each shifted with the point overload of <c>MoveLeft</c>.
    /// </summary>
    /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception>
    internal static PdfRectangle MoveLeft(this PdfRectangle start, double dist)
    {
        ThrowIfNegative(dist);

        return new PdfRectangle(start.BottomLeft.MoveLeft(dist), start.TopRight.MoveLeft(dist));
    }

    /// <summary>
    /// Returns a rectangle built from the BottomLeft and TopRight corners of <paramref name="start"/>,
    /// each shifted with the point overload of <c>MoveDown</c>.
    /// </summary>
    /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception>
    internal static PdfRectangle MoveDown(this PdfRectangle start, double dist)
    {
        ThrowIfNegative(dist);

        return new PdfRectangle(start.BottomLeft.MoveDown(dist), start.TopRight.MoveDown(dist));
    }

    /// <summary>
    /// Shared guard: rejects negative distances so the direction of a move is always given by the helper's name.
    /// </summary>
    private static void ThrowIfNegative(double dist)
    {
        if (dist < 0)
        {
            // The previous message concatenated without a space ("distmust be positive")
            // and said "positive" although zero is accepted.
            throw new ArgumentException(nameof(dist) + " must not be negative", nameof(dist));
        }
    }
}
|
||||||
|
}
|
@ -0,0 +1,73 @@
|
|||||||
|
namespace UglyToad.PdfPig.Tests.Dla
|
||||||
|
{
|
||||||
|
using System;
|
||||||
|
using System.Collections.Generic;
|
||||||
|
using System.Linq;
|
||||||
|
using System.Text;
|
||||||
|
using System.Threading.Tasks;
|
||||||
|
using UglyToad.PdfPig.Content;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis;
|
||||||
|
using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector;
|
||||||
|
using UglyToad.PdfPig.Core;
|
||||||
|
|
||||||
|
/// <summary>
/// Tests that <see cref="UnsupervisedReadingOrderDetector"/> with row-wise rules and a tolerance of 5
/// assigns reading orders that follow rows first, then left to right within a row.
/// </summary>
public class UnsupervisedReadingOrderTests
{
    /// <summary>
    /// Two blocks on the same row, supplied right-then-left, must come back left-then-right.
    /// </summary>
    [Fact]
    public void ReadingOrderOrdersItemsOnTheSameRowContents()
    {
        var left = CreateFakeTextBlock(new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(10, 10)));
        var right = CreateFakeTextBlock(new PdfRectangle(new PdfPoint(100, 0), new PdfPoint(110, 10)));

        // Input is deliberately in the wrong order.
        var shuffled = new List<TextBlock> { right, left };

        var detector = new UnsupervisedReadingOrderDetector(5, UnsupervisedReadingOrderDetector.SpatialReasoningRules.RowWise);
        var sorted = detector.Get(shuffled)
            .OrderBy(block => block.ReadingOrder)
            .ToList();

        Assert.Equal(0, sorted[0].BoundingBox.Left);
        Assert.Equal(100, sorted[1].BoundingBox.Left);
    }

    /// <summary>
    /// A page-like layout (title, two-column rows, a single line, another two-column row) made of
    /// zero-size blocks, supplied shuffled, must be ordered top to bottom and left to right.
    /// </summary>
    [Fact]
    public void DocumentTest()
    {
        var title = CreatePointBlock(42.6, 709.06);
        var firstRowLeft = CreatePointBlock(42.6, 668.86);
        var firstRowRight = CreatePointBlock(302.21, 668.86);
        var secondRowLeft = CreatePointBlock(42.6, 608.26);
        var secondRowTallerRight = CreatePointBlock(302.21, 581.35);
        var thirdRow = CreatePointBlock(42.6, 515.83);
        var fourthRowLeft = CreatePointBlock(42.6, 490.27);
        var fourthRowRight = CreatePointBlock(302.21, 491.59);

        // Input is deliberately in the wrong order.
        var shuffled = new List<TextBlock>
        {
            title, fourthRowLeft, secondRowTallerRight, fourthRowRight,
            firstRowRight, firstRowLeft, thirdRow, secondRowLeft
        };

        var expected = new[]
        {
            title, firstRowLeft, firstRowRight, secondRowLeft,
            secondRowTallerRight, thirdRow, fourthRowLeft, fourthRowRight
        };

        var detector = new UnsupervisedReadingOrderDetector(5, UnsupervisedReadingOrderDetector.SpatialReasoningRules.RowWise);
        var sorted = detector.Get(shuffled)
            .OrderBy(block => block.ReadingOrder)
            .ToList();

        for (var i = 0; i < expected.Length; i++)
        {
            Assert.Equal(expected[i].BoundingBox, sorted[i].BoundingBox);
        }
    }

    /// <summary>
    /// Builds a block whose bounding rectangle has both corners at the point (x, y).
    /// </summary>
    private static TextBlock CreatePointBlock(double x, double y)
    {
        var corner = new PdfPoint(x, y);
        return CreateFakeTextBlock(new PdfRectangle(corner, corner));
    }

    /// <summary>
    /// Wraps a single letter "a" occupying <paramref name="boundingBox"/> into a word, a line and a block.
    /// </summary>
    private static TextBlock CreateFakeTextBlock(PdfRectangle boundingBox)
    {
        // Everything after the start/end baseline points is filler; these values don't matter for the tests.
        var letter = new Letter(
            "a",
            boundingBox,
            boundingBox.BottomLeft,
            boundingBox.BottomRight,
            10, 1, null, TextRenderingMode.NeitherClip, null, null, 0, 0);

        var word = new Word(new[] { letter });
        var line = new TextLine(new[] { word });

        return new TextBlock(new[] { line });
    }
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user