amend default word extractor logic #603

The default word extractor consumes letters sorted in descending
y order. It checks whether the next letter's y value differs by
more than 0.5 units from the current baseline (the y position
taken from the first letter). However, we were checking whether
the new value was more than 0.5 above the current baseline, which
could never be true: after the initial sort every following letter
is guaranteed to have an equal or lower y value (in PDF coordinates
y increases up the page, so descending y order reads the page from
top to bottom). The check now tests for a y value more than 0.5
units below the baseline instead.
This commit is contained in:
Eliot Jones 2023-05-22 20:22:19 +01:00
parent 98be67d93b
commit 471dca8327
3 changed files with 89 additions and 26 deletions

View File

@ -1,39 +1,34 @@
using System;
using System.IO;
using UglyToad.PdfPig.Core;
using Xunit;
namespace UglyToad.PdfPig.Tests.Fonts.Type1
namespace UglyToad.PdfPig.Tests.Fonts.Type1
{
using UglyToad.PdfPig.Core;
using Xunit;
using Integration;
public class Type1CharStringParserTests
{
[Fact]
public void CorrectBoundingBoxesFlexPoints()
{
PointComparer pointComparer = new PointComparer(new DoubleComparer(3));
var pointComparer = new PointComparer(new DoubleComparer(3));
var documentFolder = Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents"));
var filePath = IntegrationHelpers.GetDocumentPath("data.pdf");
var filePath = Path.Combine(documentFolder, "data.pdf");
using var doc = PdfDocument.Open(filePath);
var page = doc.GetPage(1);
using (var doc = PdfDocument.Open(filePath))
{
var page = doc.GetPage(1);
var letters = page.Letters;
var letters = page.Letters;
// check 'm'
var m = letters[0];
Assert.Equal("m", m.Value);
Assert.Equal(new PdfPoint(253.4458, 658.431), m.GlyphRectangle.BottomLeft, pointComparer);
Assert.Equal(new PdfPoint(261.22659, 662.83446), m.GlyphRectangle.TopRight, pointComparer);
// check 'm'
var m = letters[0];
Assert.Equal("m", m.Value);
Assert.Equal(new PdfPoint(253.4458, 658.431), m.GlyphRectangle.BottomLeft, pointComparer);
Assert.Equal(new PdfPoint(261.22659, 662.83446), m.GlyphRectangle.TopRight, pointComparer);
// check 'p'
var p = letters[1];
Assert.Equal("p", p.Value);
Assert.Equal(new PdfPoint(261.70778, 656.49825), p.GlyphRectangle.BottomLeft, pointComparer);
Assert.Equal(new PdfPoint(266.6193, 662.83446), p.GlyphRectangle.TopRight, pointComparer);
}
// check 'p'
var p = letters[1];
Assert.Equal("p", p.Value);
Assert.Equal(new PdfPoint(261.70778, 656.49825), p.GlyphRectangle.BottomLeft, pointComparer);
Assert.Equal(new PdfPoint(266.6193, 662.83446), p.GlyphRectangle.TopRight, pointComparer);
}
}
}

View File

@ -0,0 +1,68 @@
namespace UglyToad.PdfPig.Tests.Util
{
    using Integration;
    using System.Linq;
    using Xunit;

    /// <summary>
    /// Verifies the words returned by <c>GetWords()</c> for single pages of
    /// several integration test documents.
    /// </summary>
    public class DefaultWordExtractorTests
    {
        /// <summary>
        /// Page 3 of "data.pdf" must produce exactly the expected sequence of words.
        /// </summary>
        [Fact]
        public void ReadWordsFromDataPdfPage3()
        {
            const string expected =
                "len supp dose 4.2 VC 0.5 11.5 VC 0.5 7.3 VC 0.5 5.8 VC 0.5 6.4 VC 0.5 10.0 VC 0.5 11.2 VC 0.5 11.2 VC 0.5 5.2 VC 0.5 7.0 VC 0.5" +
                " 16.5 VC 1.0 16.5 VC 1.0 15.2 VC 1.0 17.3 VC 1.0 22.5 VC 1.0 3";

            var actual = ReadPageText("data.pdf", 3);

            Assert.Equal(expected, actual);
        }

        /// <summary>
        /// Page 1 of the Old Gutnish document must begin with the expected opening text.
        /// </summary>
        [Fact]
        public void ReadWordsFromOldGutnishPage1()
        {
            const string expectedStart =
                "Old Gutnish - Wikipedia Page 1 of 3 Old Gutnish Old Gutnish was the dialect of Old Norse that was spoken on the Baltic island of Gotland." +
                " It shows sufficient differences from the Old West Norse and Old East Norse dialects that it is considered to be a separate branch." +
                " Gutnish is still spoken in some parts of Gotland and on the adjoining island of Fårö.";

            var actual = ReadPageText("Old Gutnish Internet Explorer.pdf", 1);

            Assert.StartsWith(expectedStart, actual);
        }

        /// <summary>
        /// Page 8 of "TIKA-1552-0.pdf" must begin with the expected opening text.
        /// </summary>
        [Fact]
        public void ReadWordsFromTikka1552Page8()
        {
            const string expectedStart =
                "2 THE BUDGET MESSAGE OF THE PRESIDENT Administrations SelectUSA initiative to help draw businesses and investment from around the world to our shores." +
                " If we want to make the best products, we also have to invest in the best ideas. That is why the Budget maintains a world-class commitment to science and research," +
                " targeting resources to those areas most likely to contribute directly to the creation of transformational technologies that can create the businesses and jobs of the future.";

            var actual = ReadPageText("TIKA-1552-0.pdf", 8);

            Assert.StartsWith(expectedStart, actual);
        }

        /// <summary>
        /// Opens the named integration document, reads the words of one page and
        /// joins their text with single spaces.
        /// </summary>
        /// <param name="documentName">File name passed to <c>IntegrationHelpers.GetDocumentPath</c>.</param>
        /// <param name="pageNumber">Page number passed to <c>GetPage</c>.</param>
        /// <returns>The text of every word on the page, separated by a single space.</returns>
        private static string ReadPageText(string documentName, int pageNumber)
        {
            var path = IntegrationHelpers.GetDocumentPath(documentName);

            using (var document = PdfDocument.Open(path))
            {
                // string.Join enumerates the words eagerly, so the text is fully
                // built before the document is disposed.
                var wordTexts = document.GetPage(pageNumber).GetWords().Select(word => word.Text);
                return string.Join(" ", wordTexts);
            }
        }
    }
}

View File

@ -50,7 +50,7 @@
continue;
}
if (letter.Location.Y > y.Value + 0.5)
if (letter.Location.Y < y.Value - 0.5)
{
if (lettersSoFar.Count > 0)
{