mirror of
				https://github.com/UglyToad/PdfPig.git
				synced 2025-10-31 16:46:55 +08:00 
			
		
		
		
	* #836 Fix UnsupervisedReadingOrder orders 2 blocks on the same row out of order Add images for documentation * Update Documentation: Additional example, Reference to wiki * Change code formating to C# on documentation * Fix link in documentation * Fix Spelling --------- Co-authored-by: David <David@david>
This commit is contained in:
		
							
								
								
									
										222
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										222
									
								
								README.md
									
									
									
									
									
								
							| @@ -12,6 +12,9 @@ This project aims to port [PDFBox](https://github.com/apache/pdfbox) to C#. | ||||
|  | ||||
| **Migrating to 0.1.6 from 0.1.x?** Use this guide: [migration to 0.1.6](https://github.com/UglyToad/PdfPig/wiki/Migration-to-0.1.6). | ||||
|  | ||||
| ## Wiki | ||||
| Check out our [wiki](https://github.com/UglyToad/PdfPig/wiki) for more examples and detailed guides on the API. | ||||
|  | ||||
| ## Installation | ||||
|  | ||||
| The package is available via the releases tab or from Nuget: | ||||
| @@ -26,20 +29,25 @@ While the version is below 1.0.0 minor versions will change the public API witho | ||||
|  | ||||
| ## Get Started | ||||
|  | ||||
| See the [wiki](https://github.com/UglyToad/PdfPig/wiki) for more examples  | ||||
|  | ||||
| ### Read words in a page | ||||
| The simplest usage at this stage is to open a document, reading the words from every page: | ||||
|  | ||||
|     using (PdfDocument document = PdfDocument.Open(@"C:\Documents\document.pdf")) | ||||
|     { | ||||
|         foreach (Page page in document.GetPages()) | ||||
|         { | ||||
|             string pageText = page.Text; | ||||
| ```cs | ||||
| using (PdfDocument document = PdfDocument.Open(@"C:\Documents\document.pdf")) | ||||
| { | ||||
| 	foreach (Page page in document.GetPages()) | ||||
| 	{ | ||||
| 		string pageText = page.Text; | ||||
|  | ||||
|             foreach (Word word in page.GetWords()) | ||||
|             { | ||||
|                 Console.WriteLine(word.Text); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 		foreach (Word word in page.GetWords()) | ||||
| 		{ | ||||
| 			Console.WriteLine(word.Text); | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| ``` | ||||
|  | ||||
| An example of the output of this is shown below: | ||||
|  | ||||
| @@ -47,20 +55,23 @@ An example of the output of this is shown below: | ||||
|  | ||||
| Where for the PDF text ("Write something in") shown at the top the 3 words (in pink) are detected and each word contains the individual letters with glyph bounding boxes. | ||||
|  | ||||
| ### Create PDF Document | ||||
| To create documents use the class `PdfDocumentBuilder`. The Standard 14 fonts provide a quick way to get started: | ||||
|  | ||||
|     PdfDocumentBuilder builder = new PdfDocumentBuilder(); | ||||
| ```cs | ||||
| PdfDocumentBuilder builder = new PdfDocumentBuilder(); | ||||
|  | ||||
|     PdfPageBuilder page = builder.AddPage(PageSize.A4); | ||||
| PdfPageBuilder page = builder.AddPage(PageSize.A4); | ||||
|  | ||||
|     // Fonts must be registered with the document builder prior to use to prevent duplication. | ||||
|     PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica); | ||||
| // Fonts must be registered with the document builder prior to use to prevent duplication. | ||||
| PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica); | ||||
|  | ||||
|     page.AddText("Hello World!", 12, new PdfPoint(25, 700), font); | ||||
| page.AddText("Hello World!", 12, new PdfPoint(25, 700), font); | ||||
|  | ||||
|     byte[] documentBytes = builder.Build(); | ||||
| byte[] documentBytes = builder.Build(); | ||||
|  | ||||
|     File.WriteAllBytes(@"C:\git\newPdf.pdf", documentBytes); | ||||
| File.WriteAllBytes(@"C:\git\newPdf.pdf", documentBytes); | ||||
| ``` | ||||
|  | ||||
| The output is a 1 page PDF document with the text "Hello World!" in Helvetica near the top of the page: | ||||
|  | ||||
| @@ -68,25 +79,90 @@ The output is a 1 page PDF document with the text "Hello World!" in Helvetica ne | ||||
|  | ||||
| Each font must be registered with the PdfDocumentBuilder prior to use to enable pages to share the font resources. Only Standard 14 fonts and TrueType fonts (.ttf) are supported. | ||||
|  | ||||
| ### Advanced Document Extraction | ||||
| In this example a more advanced document extraction is performed. `PdfDocumentBuilder` is used to create a copy of the PDF with debug information (bounding boxes and reading order) added. | ||||
|  | ||||
|  | ||||
| ```cs | ||||
| //using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; | ||||
| //using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector; | ||||
| //using UglyToad.PdfPig.DocumentLayoutAnalysis.WordExtractor; | ||||
| //using UglyToad.PdfPig.Fonts.Standard14Fonts; | ||||
|  | ||||
| // Placeholders: set these to the PDF to analyse and to the file the annotated copy is written to. | ||||
| var sourcePdfPath = ""; | ||||
| var outputPath = ""; | ||||
| var pageNumber = 1; // Page numbers start from 1, not 0. | ||||
| using (var document = PdfDocument.Open(sourcePdfPath)) | ||||
| { | ||||
| 	var builder = new PdfDocumentBuilder { }; | ||||
| 	PdfDocumentBuilder.AddedFont font = builder.AddStandard14Font(Standard14Font.Helvetica); | ||||
| 	// Copy the source page into the new document so the debug info is drawn on top of it. | ||||
| 	var pageBuilder = builder.AddPage(document, pageNumber); | ||||
| 	pageBuilder.SetStrokeColor(0, 255, 0); | ||||
| 	var page = document.GetPage(pageNumber); | ||||
|  | ||||
| 	// The analysis below runs once for the whole page (it must not be repeated per word, | ||||
| 	// otherwise every rectangle and label is drawn many times over). | ||||
| 	var letters = page.Letters; // no preprocessing | ||||
|  | ||||
| 	// 1. Extract words | ||||
| 	var wordExtractor = NearestNeighbourWordExtractor.Instance; | ||||
|  | ||||
| 	var words = wordExtractor.GetWords(letters); | ||||
|  | ||||
| 	// 2. Segment page | ||||
| 	var pageSegmenter = DocstrumBoundingBoxes.Instance; | ||||
|  | ||||
| 	var textBlocks = pageSegmenter.GetBlocks(words); | ||||
|  | ||||
| 	// 3. Postprocessing | ||||
| 	var readingOrder = UnsupervisedReadingOrderDetector.Instance; | ||||
| 	var orderedTextBlocks = readingOrder.Get(textBlocks); | ||||
|  | ||||
| 	// 4. Add debug info - Bounding boxes and reading order | ||||
| 	foreach (var block in orderedTextBlocks) | ||||
| 	{ | ||||
| 		var bbox = block.BoundingBox; | ||||
| 		pageBuilder.DrawRectangle(bbox.BottomLeft, bbox.Width, bbox.Height); | ||||
| 		// AddText expects a position (PdfPoint), so anchor the label at a corner of the block. | ||||
| 		pageBuilder.AddText(block.ReadingOrder.ToString(), 8, bbox.TopLeft, font); | ||||
| 	} | ||||
|  | ||||
| 	// 5. Write result to a file | ||||
| 	byte[] fileBytes = builder.Build(); | ||||
| 	File.WriteAllBytes(outputPath, fileBytes); // save to file | ||||
| } | ||||
| ``` | ||||
|  | ||||
|  | ||||
|  | ||||
| See [Document Layout Analysis](https://github.com/UglyToad/PdfPig/wiki/Document-Layout-Analysis) for more information on advanced document analysis. | ||||
|  | ||||
| See [Export](https://github.com/UglyToad/PdfPig/wiki/Document-Layout-Analysis#export) for more advanced tooling to analyse document layouts. | ||||
|  | ||||
|  | ||||
| ## Usage | ||||
|  | ||||
| ### PdfDocument | ||||
|  | ||||
| The `PdfDocument` class provides access to the contents of a document loaded either from file or passed in as bytes. To open from a file use the `PdfDocument.Open` static method: | ||||
|  | ||||
|     using UglyToad.PdfPig; | ||||
|     using UglyToad.PdfPig.Content; | ||||
| ```cs | ||||
| using UglyToad.PdfPig; | ||||
| using UglyToad.PdfPig.Content; | ||||
|  | ||||
|     using (PdfDocument document = PdfDocument.Open(@"C:\my-file.pdf")) | ||||
|     { | ||||
|         int pageCount = document.NumberOfPages; | ||||
| using (PdfDocument document = PdfDocument.Open(@"C:\my-file.pdf")) | ||||
| { | ||||
| 	int pageCount = document.NumberOfPages; | ||||
|  | ||||
|         // Page number starts from 1, not 0. | ||||
|         Page page = document.GetPage(1); | ||||
| 	// Page number starts from 1, not 0. | ||||
| 	Page page = document.GetPage(1); | ||||
|  | ||||
|         decimal widthInPoints = page.Width; | ||||
|         decimal heightInPoints = page.Height; | ||||
| 	decimal widthInPoints = page.Width; | ||||
| 	decimal heightInPoints = page.Height; | ||||
|  | ||||
|         string text = page.Text; | ||||
|     } | ||||
| 	string text = page.Text; | ||||
| } | ||||
| ``` | ||||
|  | ||||
| `PdfDocument` should only be used in a `using` statement since it implements `IDisposable` (unless the consumer disposes of it elsewhere). | ||||
|  | ||||
| @@ -96,10 +172,12 @@ Encrypted documents can be opened by PdfPig. To provide an owner or user passwor | ||||
|  | ||||
| You can also provide a list of passwords to try: | ||||
|  | ||||
|     using (PdfDocument document = PdfDocument.Open(@"C:\file.pdf", new ParsingOptions | ||||
|     { | ||||
|         Passwords = new List<string> { "One", "Two" } | ||||
|     })) | ||||
| ```cs | ||||
| using (PdfDocument document = PdfDocument.Open(@"C:\file.pdf", new ParsingOptions | ||||
| { | ||||
| 	Passwords = new List<string> { "One", "Two" } | ||||
| })) | ||||
| ``` | ||||
|  | ||||
| The document contains the version of the PDF specification it complies with, accessed by `document.Version`: | ||||
|  | ||||
| @@ -133,8 +211,10 @@ This creates a new `PdfPageBuilder` with the specified size. The first added pag | ||||
|  | ||||
| To draw lines and rectangles use the methods: | ||||
|  | ||||
|     void DrawLine(PdfPoint from, PdfPoint to, decimal lineWidth = 1) | ||||
|     void DrawRectangle(PdfPoint position, decimal width, decimal height, decimal lineWidth = 1) | ||||
| ```cs | ||||
| void DrawLine(PdfPoint from, PdfPoint to, decimal lineWidth = 1) | ||||
| void DrawRectangle(PdfPoint position, decimal width, decimal height, decimal lineWidth = 1) | ||||
| ``` | ||||
|  | ||||
| The line width can be varied and defaults to 1. Rectangles are unfilled and the fill color cannot be changed at present. | ||||
|  | ||||
| @@ -150,8 +230,10 @@ Which does not change the state of the page, unlike `AddText`. | ||||
|  | ||||
| Changing the RGB color of text, lines and rectangles is supported using: | ||||
|  | ||||
|     void SetStrokeColor(byte r, byte g, byte b) | ||||
|     void SetTextAndFillColor(byte r, byte g, byte b) | ||||
| ```cs | ||||
| void SetStrokeColor(byte r, byte g, byte b) | ||||
| void SetTextAndFillColor(byte r, byte g, byte b) | ||||
| ``` | ||||
|  | ||||
| Which take RGB values between 0 and 255. The color will remain active for all operations called after these methods until reset is called using: | ||||
|  | ||||
| @@ -163,14 +245,16 @@ Which resets the color for stroke, fill and text drawing to black. | ||||
|  | ||||
| The `PdfDocument` provides access to the document metadata as `DocumentInformation` defined in the PDF file. These tend not to be provided therefore most of these entries will be `null`: | ||||
|  | ||||
|     PdfDocument document = PdfDocument.Open(fileName); | ||||
| ``` | ||||
| PdfDocument document = PdfDocument.Open(fileName); | ||||
|  | ||||
|     // The name of the program used to convert this document to PDF. | ||||
|     string producer = document.Information.Producer; | ||||
| // The name of the program used to convert this document to PDF. | ||||
| string producer = document.Information.Producer; | ||||
|  | ||||
|     // The title given to the document | ||||
|     string title = document.Information.Title; | ||||
|     // etc... | ||||
| // The title given to the document | ||||
| string title = document.Information.Title; | ||||
| // etc... | ||||
| ``` | ||||
|  | ||||
| ### Document Structure (0.0.3) | ||||
|  | ||||
| @@ -180,8 +264,10 @@ The document now has a Structure member: | ||||
|  | ||||
| This provides access to tokenized PDF document content: | ||||
|  | ||||
|     Catalog catalog = structure.Catalog; | ||||
|     DictionaryToken pagesDictionary = catalog.PagesDictionary; | ||||
| ```cs | ||||
| Catalog catalog = structure.Catalog; | ||||
| DictionaryToken pagesDictionary = catalog.PagesDictionary; | ||||
| ``` | ||||
|  | ||||
| The pages dictionary is the root of the pages tree within a PDF document. The structure also exposes a `GetObject(IndirectReference reference)` method which allows random access to any object in the PDF as long as its identifier number is known. This is an identifier of the form `69 0 R` where 69 is the object number and 0 is the generation. | ||||
|  | ||||
| @@ -189,9 +275,12 @@ The pages dictionary is the root of the pages tree within a PDF document. The st | ||||
|  | ||||
| The `Page` contains the page width and height in points as well as mapping to the `PageSize` enum: | ||||
|  | ||||
|     PageSize size = Page.Size; | ||||
|  | ||||
|     bool isA4 = size == PageSize.A4; | ||||
| ```cs | ||||
| PageSize size = Page.Size; | ||||
|  | ||||
| bool isA4 = size == PageSize.A4; | ||||
| ``` | ||||
|  | ||||
| `Page` provides access to the text of the page: | ||||
|  | ||||
| @@ -259,6 +348,8 @@ This will return `false` if the document does not contain a form. | ||||
|  | ||||
| The fields can be accessed using the `AcroForm`'s `Fields` property. Since the form is defined at the document level this will return fields from all pages in the document. Fields are of the types defined by the enum `AcroFieldType`, for example `PushButton`, `Checkbox`, `Text`, etc. | ||||
|  | ||||
| Please note that forms are read-only: values cannot be changed or added using PdfPig. | ||||
|  | ||||
| ### Hyperlinks (0.1.0) | ||||
|  | ||||
| A page has a method to extract hyperlinks (annotations of link type): | ||||
| @@ -269,12 +360,15 @@ A page has a method to extract hyperlinks (annotations of link type): | ||||
|  | ||||
| The classes used to work with TrueType fonts in the PDF file are now available for public consumption. Given an input file: | ||||
|  | ||||
|     using UglyToad.PdfPig.Fonts.TrueType; | ||||
|     using UglyToad.PdfPig.Fonts.TrueType.Parser; | ||||
|  | ||||
|     byte[] fontBytes = System.IO.File.ReadAllBytes(@"C:\font.ttf"); | ||||
|     TrueTypeDataBytes input = new TrueTypeDataBytes(fontBytes); | ||||
|     TrueTypeFont font = TrueTypeFontParser.Parse(input); | ||||
| ```cs | ||||
| using UglyToad.PdfPig.Fonts.TrueType; | ||||
| using UglyToad.PdfPig.Fonts.TrueType.Parser; | ||||
|  | ||||
| byte[] fontBytes = System.IO.File.ReadAllBytes(@"C:\font.ttf"); | ||||
| TrueTypeDataBytes input = new TrueTypeDataBytes(fontBytes); | ||||
| TrueTypeFont font = TrueTypeFontParser.Parse(input); | ||||
| ``` | ||||
|  | ||||
| The parsed font can then be inspected. | ||||
|  | ||||
| @@ -282,25 +376,31 @@ The parsed font can then be inspected. | ||||
|  | ||||
| PDF files may contain other files entirely embedded inside them for document annotations. The list of embedded files and their byte content may be accessed: | ||||
|  | ||||
|     if (document.Advanced.TryGetEmbeddedFiles(out IReadOnlyList<EmbeddedFile> files) | ||||
|         && files.Count > 0) | ||||
|     { | ||||
|         var firstFile = files[0]; | ||||
|         string name = firstFile.Name; | ||||
|         IReadOnlyList<byte> bytes = firstFile.Bytes; | ||||
|     } | ||||
| ```cs | ||||
| if (document.Advanced.TryGetEmbeddedFiles(out IReadOnlyList<EmbeddedFile> files) | ||||
|     && files.Count > 0) | ||||
| { | ||||
|     var firstFile = files[0]; | ||||
|     string name = firstFile.Name; | ||||
|     IReadOnlyList<byte> bytes = firstFile.Bytes; | ||||
| } | ||||
| ``` | ||||
|  | ||||
| ### Merging (0.1.2) | ||||
|  | ||||
| You can merge 2 or more existing PDF files using the `PdfMerger` class: | ||||
|  | ||||
|     var resultFileBytes = PdfMerger.Merge(filePath1, filePath2); | ||||
|     File.WriteAllBytes(@"C:\pdfs\outputfilename.pdf", resultFileBytes); | ||||
| ```cs | ||||
| var resultFileBytes = PdfMerger.Merge(filePath1, filePath2); | ||||
| File.WriteAllBytes(@"C:\pdfs\outputfilename.pdf", resultFileBytes); | ||||
| ``` | ||||
|  | ||||
| ## API Reference | ||||
|  | ||||
| If you wish to generate doxygen documentation, run `doxygen doxygen-docs` and open `docs/doxygen/html/index.html`. | ||||
|  | ||||
| See also the [wiki](https://github.com/UglyToad/PdfPig/wiki) for detailed documentation on parts of the API. | ||||
|  | ||||
| ## Issues | ||||
|  | ||||
| Please do file an issue if you encounter a bug. | ||||
|   | ||||
							
								
								
									
										
											BIN
										
									
								
								documentation/DrawingBoundingBoxes.jpg
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								documentation/DrawingBoundingBoxes.jpg
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 191 KiB | 
							
								
								
									
										
											BIN
										
									
								
								documentation/advancedBlockEditorExample.jpg
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								documentation/advancedBlockEditorExample.jpg
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 154 KiB | 
							
								
								
									
										
											BIN
										
									
								
								documentation/boundingBoxes_ReadingOrder.png
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								documentation/boundingBoxes_ReadingOrder.png
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 130 KiB | 
| @@ -0,0 +1,279 @@ | ||||
| namespace UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector | ||||
| { | ||||
|     using System; | ||||
|     using System.Collections.Generic; | ||||
|     using System.Text; | ||||
|     using UglyToad.PdfPig.Core; | ||||
|  | ||||
|     /// <summary> | ||||
|     /// Helper methods that compute the Thick Boundary Rectangle Relations (TBRR) between two rectangles, one axis at a time. | ||||
|     /// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page. | ||||
|     /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed | ||||
|     /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para> | ||||
|     /// <para>See also https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para> | ||||
|     /// </summary> | ||||
|     public static class IntervalRelationsHelper | ||||
|     { | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Gets the Thick Boundary Rectangle Relation (TBRR) of <paramref name="a"/> relative to <paramref name="b"/> along the X axis, by comparing their Left and Right edges. | ||||
|         /// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page. | ||||
|         /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed | ||||
|         /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para> | ||||
|         /// </summary> | ||||
|         /// <param name="a">The first rectangle; the returned value reads as "a [relation] b".</param> | ||||
|         /// <param name="b">The second rectangle, against which <paramref name="a"/> is compared.</param> | ||||
|         /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param> | ||||
|         public static IntervalRelations GetRelationX(PdfRectangle a, PdfRectangle b, double T) | ||||
|         { | ||||
|             // Order is important: the first matching check below wins, and IntervalRelations.Unknown is returned when none matches. | ||||
|             if (b.Left - T <= a.Left && a.Left <= b.Left + T | ||||
|                 && (b.Right - T <= a.Right && a.Right <= b.Right + T)) | ||||
|             { | ||||
|                 return IntervalRelations.Equals; | ||||
|             } | ||||
|  | ||||
|             if (b.Left - T <= a.Right | ||||
|                 && a.Right <= b.Left + T) | ||||
|             { | ||||
|                 return IntervalRelations.Meets; | ||||
|             } | ||||
|             else if (a.Left - T <= b.Right | ||||
|                 && b.Right <= a.Left + T) | ||||
|             { | ||||
|                 return IntervalRelations.MeetsI; | ||||
|             } | ||||
|  | ||||
|             if (b.Left - T <= a.Left && a.Left <= b.Left + T | ||||
|                 && a.Right < b.Right - T) | ||||
|             { | ||||
|                 return IntervalRelations.Starts; | ||||
|             } | ||||
|             else if (a.Left - T <= b.Left && b.Left <= a.Left + T | ||||
|                 && b.Right < a.Right - T) | ||||
|             { | ||||
|                 return IntervalRelations.StartsI; | ||||
|             } | ||||
|  | ||||
|             if (a.Left > b.Left + T | ||||
|                 && (b.Right - T <= a.Right && a.Right <= b.Right + T)) | ||||
|             { | ||||
|                 return IntervalRelations.Finishes; | ||||
|             } | ||||
|             else if (b.Left > a.Left + T | ||||
|                 && (a.Right - T <= b.Right && b.Right <= a.Right + T)) | ||||
|             { | ||||
|                 return IntervalRelations.FinishesI; | ||||
|             } | ||||
|  | ||||
|             if (a.Left > b.Left + T | ||||
|                 && a.Right < b.Right - T) | ||||
|             { | ||||
|                 return IntervalRelations.During; | ||||
|             } | ||||
|             else if (b.Left > a.Left + T | ||||
|                 && b.Right < a.Right - T) | ||||
|             { | ||||
|                 return IntervalRelations.DuringI; | ||||
|             } | ||||
|  | ||||
|             if (a.Left < b.Left - T | ||||
|                 && (b.Left + T < a.Right && a.Right < b.Right - T)) | ||||
|             { | ||||
|                 return IntervalRelations.Overlaps; | ||||
|             } | ||||
|             else if (b.Left < a.Left - T | ||||
|                 && (a.Left + T < b.Right && b.Right < a.Right - T)) | ||||
|             { | ||||
|                 return IntervalRelations.OverlapsI; | ||||
|             } | ||||
|  | ||||
|             if (a.Right < b.Left - T) | ||||
|             { | ||||
|                 return IntervalRelations.Precedes; | ||||
|             } | ||||
|             else if (b.Right < a.Left - T) | ||||
|             { | ||||
|                 return IntervalRelations.PrecedesI; | ||||
|             } | ||||
|  | ||||
|             return IntervalRelations.Unknown; | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Gets the Thick Boundary Rectangle Relation (TBRR) of <paramref name="a"/> relative to <paramref name="b"/> along the Y axis, by comparing their Top and Bottom edges (e.g. Meets is returned when a's Bottom is within T of b's Top). | ||||
|         /// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page. | ||||
|         /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed | ||||
|         /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para> | ||||
|         /// </summary> | ||||
|         /// <param name="a">The first rectangle; the returned value reads as "a [relation] b".</param> | ||||
|         /// <param name="b">The second rectangle, against which <paramref name="a"/> is compared.</param> | ||||
|         /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param> | ||||
|         public static IntervalRelations GetRelationY(PdfRectangle a, PdfRectangle b, double T) | ||||
|         { | ||||
|             // Order is important: the first matching check below wins, and IntervalRelations.Unknown is returned when none matches. | ||||
|             if ((b.Top - T <= a.Top && a.Top <= b.Top + T) | ||||
|                 && (b.Bottom - T <= a.Bottom && a.Bottom <= b.Bottom + T)) | ||||
|             { | ||||
|                 return IntervalRelations.Equals; | ||||
|             } | ||||
|  | ||||
|             if (a.Top - T <= b.Bottom | ||||
|                 && b.Bottom <= a.Top + T) | ||||
|             { | ||||
|                 return IntervalRelations.MeetsI; | ||||
|             } | ||||
|             else if (b.Top - T <= a.Bottom | ||||
|                 && a.Bottom <= b.Top + T) | ||||
|             { | ||||
|                 return IntervalRelations.Meets; | ||||
|             } | ||||
|  | ||||
|             if (b.Top - T <= a.Top && a.Top <= b.Top + T | ||||
|                 && a.Bottom < b.Bottom - T) | ||||
|             { | ||||
|                 return IntervalRelations.StartsI; | ||||
|             } | ||||
|             else if (a.Top - T <= b.Top && b.Top <= a.Top + T | ||||
|                 && b.Bottom < a.Bottom - T) | ||||
|             { | ||||
|                 return IntervalRelations.Starts; | ||||
|             } | ||||
|  | ||||
|             if (a.Top > b.Top + T | ||||
|                 && (b.Bottom - T <= a.Bottom && a.Bottom <= b.Bottom + T)) | ||||
|             { | ||||
|                 return IntervalRelations.FinishesI; | ||||
|             } | ||||
|             else if (b.Top > a.Top + T | ||||
|                 && (a.Bottom - T <= b.Bottom && b.Bottom <= a.Bottom + T)) | ||||
|             { | ||||
|                 return IntervalRelations.Finishes; | ||||
|             } | ||||
|  | ||||
|             if (a.Top > b.Top + T | ||||
|                 && a.Bottom < b.Bottom - T) | ||||
|             { | ||||
|                 return IntervalRelations.DuringI; | ||||
|             } | ||||
|             else if (b.Top > a.Top + T | ||||
|                 && b.Bottom < a.Bottom - T) | ||||
|             { | ||||
|                 return IntervalRelations.During; | ||||
|             } | ||||
|  | ||||
|             if (a.Top < b.Top - T | ||||
|                 && (b.Bottom + T < a.Top && a.Bottom < b.Bottom - T)) | ||||
|             { | ||||
|                 return IntervalRelations.OverlapsI; | ||||
|             } | ||||
|             else if (b.Top < a.Top - T | ||||
|                 && (a.Bottom + T < b.Top && b.Bottom < a.Bottom - T)) | ||||
|             { | ||||
|                 return IntervalRelations.Overlaps; | ||||
|             } | ||||
|  | ||||
|             if (a.Bottom < b.Top - T) | ||||
|             { | ||||
|                 return IntervalRelations.PrecedesI; | ||||
|             } | ||||
|             else if (b.Bottom < a.Top - T) | ||||
|             { | ||||
|                 return IntervalRelations.Precedes; | ||||
|             } | ||||
|  | ||||
|             return IntervalRelations.Unknown; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// <summary> | ||||
|     /// Allen's thirteen interval relations between two intervals X and Y, plus an <see cref="Unknown"/> value. | ||||
|     /// <para>See https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para> | ||||
|     /// </summary> | ||||
|     public enum IntervalRelations | ||||
|     { | ||||
|         /// <summary> | ||||
|         /// Unknown interval relations (no other relation could be established). | ||||
|         /// </summary> | ||||
|         Unknown, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// X takes place before Y. | ||||
|         /// <para>|____X____|----------------------</para> | ||||
|         /// <para>----------------------|____Y____|</para> | ||||
|         /// </summary> | ||||
|         Precedes, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// X meets Y. | ||||
|         /// <para>|_____X______|--------------</para> | ||||
|         /// <para>--------------|______Y_____|</para> | ||||
|         /// </summary> | ||||
|         Meets, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// X overlaps with Y. | ||||
|         /// <para>|________X________|-------------</para> | ||||
|         /// <para>-------------|________Y________|</para> | ||||
|         /// </summary> | ||||
|         Overlaps, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// X starts Y. | ||||
|         /// <para>|____X____|-----------------</para> | ||||
|         /// <para>|_______Y_______|-----------</para> | ||||
|         /// </summary> | ||||
|         Starts, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// X during Y. | ||||
|         /// <para>--------|____X____|---------</para> | ||||
|         /// <para>-----|_______Y________|-----</para> | ||||
|         /// </summary> | ||||
|         During, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// X finishes Y. | ||||
|         /// <para>-----------------|____X____|</para> | ||||
|         /// <para>-----------|_______Y_______|</para> | ||||
|         /// </summary> | ||||
|         Finishes, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Inverse of <see cref="Precedes"/>: Y takes place before X. | ||||
|         /// </summary> | ||||
|         PrecedesI, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Inverse of <see cref="Meets"/>: Y meets X. | ||||
|         /// </summary> | ||||
|         MeetsI, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Inverse of <see cref="Overlaps"/>: Y overlaps with X. | ||||
|         /// </summary> | ||||
|         OverlapsI, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Inverse of <see cref="Starts"/>: Y starts X. | ||||
|         /// </summary> | ||||
|         StartsI, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Inverse of <see cref="During"/>: Y during X. | ||||
|         /// </summary> | ||||
|         DuringI, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Inverse of <see cref="Finishes"/>: Y finishes X. | ||||
|         /// </summary> | ||||
|         FinishesI, | ||||
|  | ||||
|         /// <summary> | ||||
|         /// X is equal to Y. | ||||
|         /// <para>----------|____X____|------------</para> | ||||
|         /// <para>----------|____Y____|------------</para> | ||||
|         /// </summary> | ||||
|         Equals | ||||
|     } | ||||
| } | ||||
| @@ -188,8 +188,8 @@ | ||||
|         /// <param name="T">The tolerance parameter T.</param> | ||||
|         private static bool GetBeforeInReading(TextBlock a, TextBlock b, double T) | ||||
|         { | ||||
|             IntervalRelations xRelation = GetIntervalRelationX(a, b, T); | ||||
|             IntervalRelations yRelation = GetIntervalRelationY(a, b, T); | ||||
|             IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T); | ||||
|             IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T); | ||||
|  | ||||
|             return xRelation == IntervalRelations.Precedes || | ||||
|                    yRelation == IntervalRelations.Precedes || | ||||
| @@ -207,8 +207,8 @@ | ||||
|         /// <param name="T">The tolerance parameter T.</param> | ||||
|         private static bool GetBeforeInReadingVertical(TextBlock a, TextBlock b, double T) | ||||
|         { | ||||
|             IntervalRelations xRelation = GetIntervalRelationX(a, b, T); | ||||
|             IntervalRelations yRelation = GetIntervalRelationY(a, b, T); | ||||
|             IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T); | ||||
|             IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T); | ||||
|  | ||||
|             return xRelation == IntervalRelations.Precedes || | ||||
|                 xRelation == IntervalRelations.Meets || | ||||
| @@ -237,8 +237,8 @@ | ||||
|         /// <param name="T">The tolerance parameter T.</param> | ||||
|         private static bool GetBeforeInReadingHorizontal(TextBlock a, TextBlock b, double T) | ||||
|         { | ||||
|             IntervalRelations xRelation = GetIntervalRelationX(a, b, T); | ||||
|             IntervalRelations yRelation = GetIntervalRelationY(a, b, T); | ||||
|             IntervalRelations xRelation = IntervalRelationsHelper.GetRelationX(a.BoundingBox, b.BoundingBox, T); | ||||
|             IntervalRelations yRelation = IntervalRelationsHelper.GetRelationY(a.BoundingBox, b.BoundingBox, T); | ||||
|  | ||||
|             return yRelation == IntervalRelations.Precedes || | ||||
|                    yRelation == IntervalRelations.Meets || | ||||
| @@ -259,263 +259,5 @@ | ||||
|                                                                  yRelation == IntervalRelations.OverlapsI)); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Gets the Thick Boundary Rectangle Relations (TBRR) for the X coordinate. | ||||
|         /// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page. | ||||
|         /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed | ||||
|         /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para> | ||||
|         /// </summary> | ||||
|         /// <param name="a">The first text block.</param> | ||||
|         /// <param name="b">The second text block, which <paramref name="a"/> is compared against.</param> | ||||
|         /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param> | ||||
|         private static IntervalRelations GetIntervalRelationX(TextBlock a, TextBlock b, double T) | ||||
|         { | ||||
|             // Only the horizontal edges of the two bounding boxes matter for the X relation. | ||||
|             double aLeft = a.BoundingBox.Left; | ||||
|             double aRight = a.BoundingBox.Right; | ||||
|             double bLeft = b.BoundingBox.Left; | ||||
|             double bRight = b.BoundingBox.Right; | ||||
|  | ||||
|             // Two edges are "near" when they are at most T apart. | ||||
|             bool leftsNear = bLeft - T <= aLeft && aLeft <= bLeft + T; | ||||
|             bool rightsNear = bRight - T <= aRight && aRight <= bRight + T; | ||||
|  | ||||
|             // Every relation is tested with its own condition: an inverse relation is the | ||||
|             // forward condition with a and b exchanged, never the negation of the forward one. | ||||
|             // Equals goes first so that boxes narrower than T are not reported as Meets. | ||||
|             if (leftsNear && rightsNear) | ||||
|             { | ||||
|                 return IntervalRelations.Equals; | ||||
|             } | ||||
|  | ||||
|             if (aRight < bLeft - T) | ||||
|             { | ||||
|                 return IntervalRelations.Precedes; | ||||
|             } | ||||
|  | ||||
|             if (bRight < aLeft - T) | ||||
|             { | ||||
|                 return IntervalRelations.PrecedesI; | ||||
|             } | ||||
|  | ||||
|             if (bLeft - T <= aRight && aRight <= bLeft + T) | ||||
|             { | ||||
|                 return IntervalRelations.Meets; | ||||
|             } | ||||
|  | ||||
|             if (aLeft - T <= bRight && bRight <= aLeft + T) | ||||
|             { | ||||
|                 return IntervalRelations.MeetsI; | ||||
|             } | ||||
|  | ||||
|             if (aLeft < bLeft - T && bLeft + T < aRight && aRight < bRight - T) | ||||
|             { | ||||
|                 return IntervalRelations.Overlaps; | ||||
|             } | ||||
|  | ||||
|             if (bLeft < aLeft - T && aLeft + T < bRight && bRight < aRight - T) | ||||
|             { | ||||
|                 return IntervalRelations.OverlapsI; | ||||
|             } | ||||
|  | ||||
|             if (leftsNear && aRight < bRight - T) | ||||
|             { | ||||
|                 return IntervalRelations.Starts; | ||||
|             } | ||||
|  | ||||
|             if (leftsNear && aRight > bRight + T) | ||||
|             { | ||||
|                 return IntervalRelations.StartsI; | ||||
|             } | ||||
|  | ||||
|             if (aLeft > bLeft + T && aRight < bRight - T) | ||||
|             { | ||||
|                 return IntervalRelations.During; | ||||
|             } | ||||
|  | ||||
|             if (aLeft < bLeft - T && aRight > bRight + T) | ||||
|             { | ||||
|                 return IntervalRelations.DuringI; | ||||
|             } | ||||
|  | ||||
|             if (aLeft > bLeft + T && rightsNear) | ||||
|             { | ||||
|                 return IntervalRelations.Finishes; | ||||
|             } | ||||
|  | ||||
|             if (aLeft < bLeft - T && rightsNear) | ||||
|             { | ||||
|                 return IntervalRelations.FinishesI; | ||||
|             } | ||||
|  | ||||
|             // Reached only when no comparison holds at all (e.g. a NaN coordinate). | ||||
|             return IntervalRelations.Unknown; | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Gets the Thick Boundary Rectangle Relations (TBRR) for the Y coordinate. | ||||
|         /// <para>The Thick Boundary Rectangle Relations (TBRR) is a set of qualitative relations representing the spatial relations of the document objects on the page. | ||||
|         /// For every pair of document objects a and b, one X and one Y interval relation hold. If one considers the pair in reversed | ||||
|         /// order, the inverse interval relation holds. Therefore the directed graph g_i representing these relations is complete.</para> | ||||
|         /// </summary> | ||||
|         /// <param name="a">The first text block.</param> | ||||
|         /// <param name="b">The second text block, which <paramref name="a"/> is compared against.</param> | ||||
|         /// <param name="T">The tolerance parameter T. If two coordinates are closer than T they are considered equal.</param> | ||||
|         private static IntervalRelations GetIntervalRelationY(TextBlock a, TextBlock b, double T) | ||||
|         { | ||||
|             // Only the vertical edges of the two bounding boxes matter for the Y relation. | ||||
|             double aTop = a.BoundingBox.Top; | ||||
|             double aBottom = a.BoundingBox.Bottom; | ||||
|             double bTop = b.BoundingBox.Top; | ||||
|             double bBottom = b.BoundingBox.Bottom; | ||||
|  | ||||
|             // Two edges are "near" when they are at most T apart. | ||||
|             bool topsNear = bTop - T <= aTop && aTop <= bTop + T; | ||||
|             bool bottomsNear = bBottom - T <= aBottom && aBottom <= bBottom + T; | ||||
|  | ||||
|             // Y grows upwards while reading runs downwards, so each interval starts at its | ||||
|             // Top and ends at its Bottom: a precedes b when a lies entirely above b. | ||||
|             // An inverse relation is the forward condition with a and b exchanged. | ||||
|             // Equals goes first so that boxes shorter than T are not reported as Meets. | ||||
|             if (topsNear && bottomsNear) | ||||
|             { | ||||
|                 return IntervalRelations.Equals; | ||||
|             } | ||||
|  | ||||
|             if (aBottom > bTop + T) | ||||
|             { | ||||
|                 return IntervalRelations.Precedes; | ||||
|             } | ||||
|  | ||||
|             if (bBottom > aTop + T) | ||||
|             { | ||||
|                 return IntervalRelations.PrecedesI; | ||||
|             } | ||||
|  | ||||
|             if (bTop - T <= aBottom && aBottom <= bTop + T) | ||||
|             { | ||||
|                 return IntervalRelations.Meets; | ||||
|             } | ||||
|  | ||||
|             if (aTop - T <= bBottom && bBottom <= aTop + T) | ||||
|             { | ||||
|                 return IntervalRelations.MeetsI; | ||||
|             } | ||||
|  | ||||
|             if (aTop > bTop + T && aBottom < bTop - T && aBottom > bBottom + T) | ||||
|             { | ||||
|                 return IntervalRelations.Overlaps; | ||||
|             } | ||||
|  | ||||
|             if (bTop > aTop + T && bBottom < aTop - T && bBottom > aBottom + T) | ||||
|             { | ||||
|                 return IntervalRelations.OverlapsI; | ||||
|             } | ||||
|  | ||||
|             if (topsNear && aBottom > bBottom + T) | ||||
|             { | ||||
|                 return IntervalRelations.Starts; | ||||
|             } | ||||
|  | ||||
|             if (topsNear && aBottom < bBottom - T) | ||||
|             { | ||||
|                 return IntervalRelations.StartsI; | ||||
|             } | ||||
|  | ||||
|             if (aTop < bTop - T && aBottom > bBottom + T) | ||||
|             { | ||||
|                 return IntervalRelations.During; | ||||
|             } | ||||
|  | ||||
|             if (aTop > bTop + T && aBottom < bBottom - T) | ||||
|             { | ||||
|                 return IntervalRelations.DuringI; | ||||
|             } | ||||
|  | ||||
|             if (aTop < bTop - T && bottomsNear) | ||||
|             { | ||||
|                 return IntervalRelations.Finishes; | ||||
|             } | ||||
|  | ||||
|             if (aTop > bTop + T && bottomsNear) | ||||
|             { | ||||
|                 return IntervalRelations.FinishesI; | ||||
|             } | ||||
|  | ||||
|             // Reached only when no comparison holds at all (e.g. a NaN coordinate). | ||||
|             return IntervalRelations.Unknown; | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Allen’s thirteen interval relations. | ||||
|         /// <para>See https://en.wikipedia.org/wiki/Allen%27s_interval_algebra</para> | ||||
|         /// </summary> | ||||
|         private enum IntervalRelations | ||||
|         { | ||||
|             /// <summary> | ||||
|             /// Unknown interval relations: returned when none of the thirteen relations was matched. | ||||
|             /// </summary> | ||||
|             Unknown, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// X takes place before Y. | ||||
|             /// <para>|____X____|......................</para> | ||||
|             /// <para>......................|____Y____|</para> | ||||
|             /// </summary> | ||||
|             Precedes, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// X meets Y: X ends where Y starts. | ||||
|             /// <para>|____X____|.................</para> | ||||
|             /// <para>.................|____Y____|</para> | ||||
|             /// </summary> | ||||
|             Meets, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// X overlaps with Y: X starts first and ends inside Y. | ||||
|             /// <para>|______X______|.................</para> | ||||
|             /// <para>.................|______Y______|</para> | ||||
|             /// </summary> | ||||
|             Overlaps, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// X starts Y: both start together and X ends first. | ||||
|             /// <para>|____X____|.................</para> | ||||
|             /// <para>|_____Y_____|..............</para> | ||||
|             /// </summary> | ||||
|             Starts, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// X during Y: X lies strictly inside Y. | ||||
|             /// <para>........|____X____|.........</para> | ||||
|             /// <para>.....|______Y______|.....</para> | ||||
|             /// </summary> | ||||
|             During, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// X finishes Y: both end together and X starts later. | ||||
|             /// <para>.................|____X____|</para> | ||||
|             /// <para>..............|_____Y_____|</para> | ||||
|             /// </summary> | ||||
|             Finishes, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// Inverse precedes: the Precedes relation with X and Y exchanged (Y takes place before X). | ||||
|             /// </summary> | ||||
|             PrecedesI, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// Inverse meets: the Meets relation with X and Y exchanged (Y meets X). | ||||
|             /// </summary> | ||||
|             MeetsI, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// Inverse overlaps: the Overlaps relation with X and Y exchanged (Y overlaps with X). | ||||
|             /// </summary> | ||||
|             OverlapsI, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// Inverse starts: the Starts relation with X and Y exchanged (Y starts X). | ||||
|             /// </summary> | ||||
|             StartsI, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// Inverse during: the During relation with X and Y exchanged (Y during X). | ||||
|             /// </summary> | ||||
|             DuringI, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// Inverse finishes: the Finishes relation with X and Y exchanged (Y finishes X). | ||||
|             /// </summary> | ||||
|             FinishesI, | ||||
|  | ||||
|             /// <summary> | ||||
|             /// X is equal to Y. This relation is its own inverse. | ||||
|             /// <para>..........|____X____|............</para> | ||||
|             /// <para>..........|____Y____|............</para> | ||||
|             /// </summary> | ||||
|             Equals | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
							
								
								
									
										319
									
								
								src/UglyToad.PdfPig.Tests/Dla/IntervalRelationsHelperTests.cs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										319
									
								
								src/UglyToad.PdfPig.Tests/Dla/IntervalRelationsHelperTests.cs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,319 @@ | ||||
| namespace UglyToad.PdfPig.Tests.Dla | ||||
| { | ||||
|     using System.Collections.Generic; | ||||
|     using System.Linq; | ||||
|     using UglyToad.PdfPig.Content; | ||||
|     using UglyToad.PdfPig.DocumentLayoutAnalysis; | ||||
|     using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector; | ||||
|     using UglyToad.PdfPig.Core; | ||||
|  | ||||
|     public class IntervalRelationsHelperTests | ||||
|     { | ||||
|         // Note (0,0) is bottom left of page | ||||
|  | ||||
|         /// <summary> | ||||
|         /// A is equal to B. | ||||
|         /// <para>----------|____A____|------------</para> | ||||
|         /// <para>----------|____B____|------------</para> | ||||
|         /// </summary> | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Equals_X() | ||||
|         { | ||||
|             var a = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(10, 10)); | ||||
|  | ||||
|             var res = IntervalRelationsHelper.GetRelationX(a, a, 5); | ||||
|  | ||||
|             Assert.Equal(IntervalRelations.Equals, res); | ||||
|         } | ||||
|  | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Equals_Y() | ||||
|         { | ||||
|             var a = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(10, 10)); | ||||
|  | ||||
|             var res = IntervalRelationsHelper.GetRelationY(a, a, 5); | ||||
|  | ||||
|             Assert.Equal(IntervalRelations.Equals, res); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Precedes: A takes place before B. | ||||
|         /// <para>|____A____|----------------------</para> | ||||
|         /// <para>----------------------|____B____|</para> | ||||
|         /// </summary> | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Precedes_X() | ||||
|         { | ||||
|             var a = PdfPointTestExtensions.BoxAtTopLeft(); | ||||
|             var b = PdfPointTestExtensions.BoxAtTopLeft().MoveLeft(100); | ||||
|  | ||||
|             AssertRelationX(a, b, IntervalRelations.Precedes, IntervalRelations.PrecedesI); | ||||
|         } | ||||
|  | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Precedes_Y() | ||||
|         { | ||||
|             var a = PdfPointTestExtensions.BoxAtTopLeft(); | ||||
|             var b = a.MoveDown(200); | ||||
|  | ||||
|             AssertRelationY(a, b, IntervalRelations.Precedes, IntervalRelations.PrecedesI); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// A meets B. | ||||
|         /// <para>|_____A______|--------------</para> | ||||
|         /// <para>--------------|______B_____|</para> | ||||
|         /// </summary> | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Meets_X() | ||||
|         { | ||||
|             var a = PdfPointTestExtensions.BoxAtTopLeft(100); | ||||
|             var b = a.MoveLeft(100); | ||||
|  | ||||
|             AssertRelationX(a, b, IntervalRelations.Meets, IntervalRelations.MeetsI); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// A meets B when the gap between them is smaller than the tolerance. | ||||
|         /// <para>|_____A______|--------------</para> | ||||
|         /// <para>--------------|______B_____|</para> | ||||
|         /// </summary> | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Meets_X_WithinTolerance() | ||||
|         { | ||||
|             var a = PdfPointTestExtensions.BoxAtTopLeft(100); | ||||
|             var b = a.MoveLeft(110); | ||||
|  | ||||
|             // The gap of 10 is inside the tolerance of 11 | ||||
|             AssertRelationX(a, b, IntervalRelations.Meets, IntervalRelations.MeetsI, 11); | ||||
|         } | ||||
|  | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Meets_Y() | ||||
|         { | ||||
|             var a = PdfPointTestExtensions.BoxAtTopLeft(100); | ||||
|             var b = a.MoveDown(100); | ||||
|  | ||||
|             AssertRelationY(a, b, IntervalRelations.Meets, IntervalRelations.MeetsI); | ||||
|         } | ||||
|  | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Meets_Y_WhenMovedDown_BecomesPreceeds() | ||||
|         { | ||||
|             // We take an A and a B that meet, then move B further down so that A precedes it | ||||
|             var startPoint = new PdfPoint(100, 600); | ||||
|             var a = new PdfRectangle(startPoint, startPoint.MoveDown(100)); | ||||
|             var meetsABox = a.MoveDown(100); | ||||
|  | ||||
|             AssertRelationY(a, meetsABox, IntervalRelations.Meets, IntervalRelations.MeetsI); | ||||
|  | ||||
|             var preceededByABox = meetsABox.MoveDown(100); | ||||
|  | ||||
|             AssertRelationY(a, preceededByABox, IntervalRelations.Precedes, IntervalRelations.PrecedesI); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// A overlaps with B. | ||||
|         /// <para>|________A________|-------------</para> | ||||
|         /// <para>-------------|________B________|</para> | ||||
|         /// </summary> | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Overlaps_X() | ||||
|         { | ||||
|             var a = PdfPointTestExtensions.BoxAtTopLeft(100); | ||||
|             var b = a.MoveLeft(a.Width / 2); | ||||
|  | ||||
|             AssertRelationX(a, b, IntervalRelations.Overlaps, IntervalRelations.OverlapsI); | ||||
|         } | ||||
|  | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Overlaps_Y() | ||||
|         { | ||||
|             var a = PdfPointTestExtensions.BoxAtTopLeft(100); | ||||
|             var b = a.MoveLeft(500).MoveDown(a.Height / 2); // Only the move down is important | ||||
|  | ||||
|             AssertRelationY(a, b, IntervalRelations.Overlaps, IntervalRelations.OverlapsI); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// A starts B. | ||||
|         /// <para>|____A____|-----------------</para> | ||||
|         /// <para>|_______B_______|-----------</para> | ||||
|         /// </summary> | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Starts_X() | ||||
|         { | ||||
|             var topLeft = PdfPointTestExtensions.OriginTopLeft(); | ||||
|             var a = new PdfRectangle(topLeft, topLeft.MoveLeft(50).MoveDown(10)); | ||||
|             var b = new PdfRectangle(topLeft, topLeft.MoveLeft(100).MoveDown(10)); | ||||
|  | ||||
|             AssertRelationX(a, b, IntervalRelations.Starts, IntervalRelations.StartsI); | ||||
|         } | ||||
|  | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Starts_Y() | ||||
|         { | ||||
|             var topLeft = PdfPointTestExtensions.OriginTopLeft(); | ||||
|             var a = new PdfRectangle(topLeft, topLeft.MoveLeft(100).MoveDown(100)); | ||||
|             var b = new PdfRectangle(topLeft, topLeft.MoveLeft(100).MoveDown(200)); | ||||
|  | ||||
|             AssertRelationY(a, b, IntervalRelations.Starts, IntervalRelations.StartsI); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// A during B. | ||||
|         /// <para>--------|____A____|---------</para> | ||||
|         /// <para>-----|_______B________|-----</para> | ||||
|         /// </summary> | ||||
|         [Fact] | ||||
|         public void IntervalRelation_During_X() | ||||
|         { | ||||
|             var a = new PdfRectangle(new PdfPoint(20, 0), new PdfPoint(80, 0)); | ||||
|             var b = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(100, 0)); | ||||
|  | ||||
|             AssertRelationX(a, b, IntervalRelations.During, IntervalRelations.DuringI); | ||||
|         } | ||||
|  | ||||
|         [Fact] | ||||
|         public void IntervalRelation_During_Y() | ||||
|         { | ||||
|             var a = new PdfRectangle(new PdfPoint(0, 20), new PdfPoint(0, 80)); | ||||
|             var b = new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(0, 100)); | ||||
|  | ||||
|             AssertRelationY(a, b, IntervalRelations.During, IntervalRelations.DuringI); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// A finishes B. | ||||
|         /// <para>-----------------|____A____|</para> | ||||
|         /// <para>-----------|_______B_______|</para> | ||||
|         /// </summary> | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Finishes_X() | ||||
|         { | ||||
|             var topRight = PdfPointTestExtensions.OriginTopLeft().MoveLeft(400); | ||||
|             var a = new PdfRectangle(topRight.MoveX(-100), topRight); | ||||
|             var b = new PdfRectangle(topRight.MoveX(-200), topRight); | ||||
|  | ||||
|             AssertRelationX(a, b, IntervalRelations.Finishes, IntervalRelations.FinishesI); | ||||
|         } | ||||
|  | ||||
|         [Fact] | ||||
|         public void IntervalRelation_Finishes_Y() | ||||
|         { | ||||
|             var a = PdfPointTestExtensions.BoxAtTopLeft(20).MoveDown(20); | ||||
|             var b = PdfPointTestExtensions.BoxAtTopLeft(40); | ||||
|  | ||||
|             AssertRelationY(a, b, IntervalRelations.Finishes, IntervalRelations.FinishesI); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Asserts the X relation of (a, b) and then the relation of the swapped pair (b, a). | ||||
|         /// </summary> | ||||
|         /// <param name="a">The first rectangle.</param> | ||||
|         /// <param name="b">The second rectangle.</param> | ||||
|         /// <param name="expected">The relation expected for (a, b).</param> | ||||
|         /// <param name="expectedInverse">The relation expected for (b, a).</param> | ||||
|         /// <param name="tolerance">The tolerance passed to the helper.</param> | ||||
|         private static void AssertRelationX(PdfRectangle a, PdfRectangle b, IntervalRelations expected, IntervalRelations expectedInverse, double tolerance = 5) | ||||
|         { | ||||
|             var res = IntervalRelationsHelper.GetRelationX(a, b, tolerance); | ||||
|             var resInverse = IntervalRelationsHelper.GetRelationX(b, a, tolerance); | ||||
|  | ||||
|             Assert.Equal(expected, res); | ||||
|             Assert.Equal(expectedInverse, resInverse); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Asserts the Y relation of (a, b) and then the relation of the swapped pair (b, a). | ||||
|         /// </summary> | ||||
|         /// <param name="a">The first rectangle.</param> | ||||
|         /// <param name="b">The second rectangle.</param> | ||||
|         /// <param name="expected">The relation expected for (a, b).</param> | ||||
|         /// <param name="expectedInverse">The relation expected for (b, a).</param> | ||||
|         /// <param name="tolerance">The tolerance passed to the helper.</param> | ||||
|         private static void AssertRelationY(PdfRectangle a, PdfRectangle b, IntervalRelations expected, IntervalRelations expectedInverse, double tolerance = 5) | ||||
|         { | ||||
|             var res = IntervalRelationsHelper.GetRelationY(a, b, tolerance); | ||||
|             var resInverse = IntervalRelationsHelper.GetRelationY(b, a, tolerance); | ||||
|  | ||||
|             Assert.Equal(expected, res); | ||||
|             Assert.Equal(expectedInverse, resInverse); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     internal static class PdfPointTestExtensions | ||||
|     { | ||||
|         /// <summary> | ||||
|         /// The point (0, 800), which the tests use as the top left corner of the page. | ||||
|         /// </summary> | ||||
|         internal static PdfPoint OriginTopLeft() | ||||
|         { | ||||
|             return new PdfPoint(0, 800); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Shifts the point by <paramref name="dist"/> along X. | ||||
|         /// <para>NOTE: despite the name, the offset handed to MoveX is positive; the relation tests | ||||
|         /// rely on <c>a.MoveLeft(n)</c> coming after <c>a</c> on the X axis.</para> | ||||
|         /// </summary> | ||||
|         /// <param name="it">The point to shift.</param> | ||||
|         /// <param name="dist">The distance to shift by. Must not be negative.</param> | ||||
|         /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception> | ||||
|         internal static PdfPoint MoveLeft(this PdfPoint it, double dist) | ||||
|         { | ||||
|             ThrowIfNegative(dist); | ||||
|  | ||||
|             return it.MoveX(dist); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Shifts the point by -<paramref name="dist"/> along Y. | ||||
|         /// </summary> | ||||
|         /// <param name="it">The point to shift.</param> | ||||
|         /// <param name="dist">The distance to shift by. Must not be negative.</param> | ||||
|         /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception> | ||||
|         internal static PdfPoint MoveDown(this PdfPoint it, double dist) | ||||
|         { | ||||
|             ThrowIfNegative(dist); | ||||
|  | ||||
|             return it.MoveY(-dist); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Builds a rectangle from <see cref="OriginTopLeft"/> and that same point shifted by | ||||
|         /// <paramref name="length"/> with both MoveLeft and MoveDown. | ||||
|         /// </summary> | ||||
|         /// <param name="length">The side length of the box. Must not be negative.</param> | ||||
|         internal static PdfRectangle BoxAtTopLeft(double length = 10d) | ||||
|         { | ||||
|             return new PdfRectangle(OriginTopLeft(), OriginTopLeft().MoveLeft(length).MoveDown(length)); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Applies the point overload of MoveLeft to the BottomLeft and TopRight corners and | ||||
|         /// builds a new rectangle from the shifted corners. | ||||
|         /// </summary> | ||||
|         /// <param name="start">The rectangle to shift.</param> | ||||
|         /// <param name="dist">The distance to shift by. Must not be negative.</param> | ||||
|         /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception> | ||||
|         internal static PdfRectangle MoveLeft(this PdfRectangle start, double dist) | ||||
|         { | ||||
|             ThrowIfNegative(dist); | ||||
|  | ||||
|             return new PdfRectangle(start.BottomLeft.MoveLeft(dist), start.TopRight.MoveLeft(dist)); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Applies the point overload of MoveDown to the BottomLeft and TopRight corners and | ||||
|         /// builds a new rectangle from the shifted corners. | ||||
|         /// </summary> | ||||
|         /// <param name="start">The rectangle to shift.</param> | ||||
|         /// <param name="dist">The distance to shift by. Must not be negative.</param> | ||||
|         /// <exception cref="ArgumentException">Thrown when <paramref name="dist"/> is negative.</exception> | ||||
|         internal static PdfRectangle MoveDown(this PdfRectangle start, double dist) | ||||
|         { | ||||
|             ThrowIfNegative(dist); | ||||
|  | ||||
|             return new PdfRectangle(start.BottomLeft.MoveDown(dist), start.TopRight.MoveDown(dist)); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Shared guard of the Move helpers: zero is accepted, negative distances are rejected. | ||||
|         /// </summary> | ||||
|         private static void ThrowIfNegative(double dist) | ||||
|         { | ||||
|             if (dist < 0) | ||||
|             { | ||||
|                 throw new ArgumentException(nameof(dist) + " must not be negative", nameof(dist)); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
| @@ -0,0 +1,73 @@ | ||||
| namespace UglyToad.PdfPig.Tests.Dla | ||||
| { | ||||
|     using System; | ||||
|     using System.Collections.Generic; | ||||
|     using System.Linq; | ||||
|     using System.Text; | ||||
|     using System.Threading.Tasks; | ||||
|     using UglyToad.PdfPig.Content; | ||||
|     using UglyToad.PdfPig.DocumentLayoutAnalysis; | ||||
|     using UglyToad.PdfPig.DocumentLayoutAnalysis.ReadingOrderDetector; | ||||
|     using UglyToad.PdfPig.Core; | ||||
|  | ||||
|     public class UnsupervisedReadingOrderTests | ||||
|     { | ||||
|         /// <summary> | ||||
|         /// Two blocks on the same row, supplied right block first, must come back left block first. | ||||
|         /// </summary> | ||||
|         [Fact] | ||||
|         public void ReadingOrderOrdersItemsOnTheSameRowContents() | ||||
|         { | ||||
|             TextBlock left = CreateFakeTextBlock(new PdfRectangle(new PdfPoint(0, 0), new PdfPoint(10, 10))); | ||||
|             TextBlock right = CreateFakeTextBlock(new PdfRectangle(new PdfPoint(100, 0), new PdfPoint(110, 10))); | ||||
|             var detector = new UnsupervisedReadingOrderDetector(5, UnsupervisedReadingOrderDetector.SpatialReasoningRules.RowWise); | ||||
|  | ||||
|             // We deliberately submit in the wrong order | ||||
|             var shuffled = new List<TextBlock>() { right, left }; | ||||
|             var ordered = detector.Get(shuffled).OrderBy(x => x.ReadingOrder).ToList(); | ||||
|  | ||||
|             Assert.Equal(0, ordered[0].BoundingBox.Left); | ||||
|             Assert.Equal(100, ordered[1].BoundingBox.Left); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Eight zero-size blocks laid out as a title followed by rows of one or two blocks | ||||
|         /// must be returned row by row, left block before right block. | ||||
|         /// </summary> | ||||
|         [Fact] | ||||
|         public void DocumentTest() | ||||
|         { | ||||
|             var title = CreateFakeTextBlockAt(42.6, 709.06); | ||||
|             var line1_Left = CreateFakeTextBlockAt(42.6, 668.86); | ||||
|             var line1_Right = CreateFakeTextBlockAt(302.21, 668.86); | ||||
|             var line2_Left = CreateFakeTextBlockAt(42.6, 608.26); | ||||
|             var line2_Taller_Right = CreateFakeTextBlockAt(302.21, 581.35); | ||||
|             var line3 = CreateFakeTextBlockAt(42.6, 515.83); | ||||
|             var line4_left = CreateFakeTextBlockAt(42.6, 490.27); | ||||
|             var line4_right = CreateFakeTextBlockAt(302.21, 491.59); | ||||
|  | ||||
|             var expectedOrder = new[] { title, line1_Left, line1_Right, line2_Left, line2_Taller_Right, line3, line4_left, line4_right }; | ||||
|             var detector = new UnsupervisedReadingOrderDetector(5, UnsupervisedReadingOrderDetector.SpatialReasoningRules.RowWise); | ||||
|  | ||||
|             // We deliberately submit in the wrong order | ||||
|             var shuffled = new List<TextBlock>() { title, line4_left, line2_Taller_Right, line4_right, line1_Right, line1_Left, line3, line2_Left }; | ||||
|             var ordered = detector.Get(shuffled).OrderBy(x => x.ReadingOrder).ToList(); | ||||
|  | ||||
|             for (var i = 0; i < expectedOrder.Length; i++) | ||||
|             { | ||||
|                 Assert.Equal(expectedOrder[i].BoundingBox, ordered[i].BoundingBox); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Creates a fake block whose bounding box has both corners at (x, y). | ||||
|         /// </summary> | ||||
|         private static TextBlock CreateFakeTextBlockAt(double x, double y) | ||||
|         { | ||||
|             return CreateFakeTextBlock(new PdfRectangle(new PdfPoint(x: x, y: y), new PdfPoint(x: x, y: y))); | ||||
|         } | ||||
|  | ||||
|         /// <summary> | ||||
|         /// Wraps a single letter "a" with the given bounding box in a word, a line and a block. | ||||
|         /// </summary> | ||||
|         /// <param name="boundingBox">The bounding box given to the letter.</param> | ||||
|         private static TextBlock CreateFakeTextBlock(PdfRectangle boundingBox) | ||||
|         { | ||||
|             // Apart from the box, the letter arguments don't matter | ||||
|             var letter = new Letter("a", | ||||
|                 boundingBox, | ||||
|                 boundingBox.BottomLeft, | ||||
|                 boundingBox.BottomRight, | ||||
|                 10, 1, null, TextRenderingMode.NeitherClip, null, null, 0, 0); | ||||
|             var word = new Word(new[] { letter }); | ||||
|             var line = new TextLine(new[] { word }); | ||||
|  | ||||
|             return new TextBlock(new[] { line }); | ||||
|         } | ||||
|     } | ||||
| } | ||||
		Reference in New Issue
	
	Block a user
	 davebrokit
					davebrokit