mirror of
https://github.com/UglyToad/PdfPig.git
synced 2025-04-05 20:55:01 +08:00
amend default word extractor logic #603
The default word extractor consumed letters sorted in descending order of y value, and checked whether the following letter's y value differed by more than 0.5 units from the current baseline position (taken from the first letter). However, we were checking whether the new value was more than 0.5 units above the current baseline, which it could never be: because of the initial sort, each following letter is guaranteed to have a y value equal to or lower than the baseline (the descending sort processes the page from top to bottom).
This commit is contained in:
parent
98be67d93b
commit
471dca8327
@ -1,39 +1,34 @@
|
||||
using System;
|
||||
using System.IO;
|
||||
using UglyToad.PdfPig.Core;
|
||||
using Xunit;
|
||||
|
||||
namespace UglyToad.PdfPig.Tests.Fonts.Type1
|
||||
namespace UglyToad.PdfPig.Tests.Fonts.Type1
|
||||
{
|
||||
using UglyToad.PdfPig.Core;
|
||||
using Xunit;
|
||||
using Integration;
|
||||
|
||||
public class Type1CharStringParserTests
|
||||
{
|
||||
[Fact]
|
||||
public void CorrectBoundingBoxesFlexPoints()
|
||||
{
|
||||
PointComparer pointComparer = new PointComparer(new DoubleComparer(3));
|
||||
var pointComparer = new PointComparer(new DoubleComparer(3));
|
||||
|
||||
var documentFolder = Path.GetFullPath(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "..", "..", "..", "Integration", "Documents"));
|
||||
var filePath = IntegrationHelpers.GetDocumentPath("data.pdf");
|
||||
|
||||
var filePath = Path.Combine(documentFolder, "data.pdf");
|
||||
using var doc = PdfDocument.Open(filePath);
|
||||
var page = doc.GetPage(1);
|
||||
|
||||
using (var doc = PdfDocument.Open(filePath))
|
||||
{
|
||||
var page = doc.GetPage(1);
|
||||
var letters = page.Letters;
|
||||
|
||||
var letters = page.Letters;
|
||||
// check 'm'
|
||||
var m = letters[0];
|
||||
Assert.Equal("m", m.Value);
|
||||
Assert.Equal(new PdfPoint(253.4458, 658.431), m.GlyphRectangle.BottomLeft, pointComparer);
|
||||
Assert.Equal(new PdfPoint(261.22659, 662.83446), m.GlyphRectangle.TopRight, pointComparer);
|
||||
|
||||
// check 'm'
|
||||
var m = letters[0];
|
||||
Assert.Equal("m", m.Value);
|
||||
Assert.Equal(new PdfPoint(253.4458, 658.431), m.GlyphRectangle.BottomLeft, pointComparer);
|
||||
Assert.Equal(new PdfPoint(261.22659, 662.83446), m.GlyphRectangle.TopRight, pointComparer);
|
||||
|
||||
// check 'p'
|
||||
var p = letters[1];
|
||||
Assert.Equal("p", p.Value);
|
||||
Assert.Equal(new PdfPoint(261.70778, 656.49825), p.GlyphRectangle.BottomLeft, pointComparer);
|
||||
Assert.Equal(new PdfPoint(266.6193, 662.83446), p.GlyphRectangle.TopRight, pointComparer);
|
||||
}
|
||||
// check 'p'
|
||||
var p = letters[1];
|
||||
Assert.Equal("p", p.Value);
|
||||
Assert.Equal(new PdfPoint(261.70778, 656.49825), p.GlyphRectangle.BottomLeft, pointComparer);
|
||||
Assert.Equal(new PdfPoint(266.6193, 662.83446), p.GlyphRectangle.TopRight, pointComparer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
68
src/UglyToad.PdfPig.Tests/Util/DefaultWordExtractorTests.cs
Normal file
68
src/UglyToad.PdfPig.Tests/Util/DefaultWordExtractorTests.cs
Normal file
@ -0,0 +1,68 @@
|
||||
namespace UglyToad.PdfPig.Tests.Util
{
    using Integration;
    using System.Linq;
    using Xunit;

    /// <summary>
    /// Integration tests that read the words of a page from a sample document via
    /// <c>GetWords()</c> and compare the space-joined word text with the expected reading order.
    /// </summary>
    public class DefaultWordExtractorTests
    {
        /// <summary>
        /// Page 3 of "data.pdf" must produce exactly the expected sequence of words.
        /// </summary>
        [Fact]
        public void ReadWordsFromDataPdfPage3()
        {
            const string expectedText =
                "len supp dose 4.2 VC 0.5 11.5 VC 0.5 7.3 VC 0.5 5.8 VC 0.5 6.4 VC 0.5 10.0 VC 0.5 11.2 VC 0.5 11.2 VC 0.5 5.2 VC 0.5 7.0 VC 0.5" +
                " 16.5 VC 1.0 16.5 VC 1.0 15.2 VC 1.0 17.3 VC 1.0 22.5 VC 1.0 3";

            var actualText = ReadPageText("data.pdf", 3);

            Assert.Equal(expectedText, actualText);
        }

        /// <summary>
        /// Page 1 of "Old Gutnish Internet Explorer.pdf" must begin with the expected opening text.
        /// </summary>
        [Fact]
        public void ReadWordsFromOldGutnishPage1()
        {
            const string expectedStart =
                "Old Gutnish - Wikipedia Page 1 of 3 Old Gutnish Old Gutnish was the dialect of Old Norse that was spoken on the Baltic island of Gotland." +
                " It shows sufficient differences from the Old West Norse and Old East Norse dialects that it is considered to be a separate branch." +
                " Gutnish is still spoken in some parts of Gotland and on the adjoining island of Fårö.";

            var actualText = ReadPageText("Old Gutnish Internet Explorer.pdf", 1);

            Assert.StartsWith(expectedStart, actualText);
        }

        /// <summary>
        /// Page 8 of "TIKA-1552-0.pdf" must begin with the expected opening text.
        /// </summary>
        [Fact]
        public void ReadWordsFromTikka1552Page8()
        {
            const string expectedStart =
                "2 THE BUDGET MESSAGE OF THE PRESIDENT Administration’s SelectUSA initiative to help draw businesses and investment from around the world to our shores." +
                " If we want to make the best products, we also have to invest in the best ideas. That is why the Budget maintains a world-class commitment to science and research," +
                " targeting resources to those areas most likely to contribute directly to the creation of transformational technologies that can create the businesses and jobs of the future.";

            var actualText = ReadPageText("TIKA-1552-0.pdf", 8);

            Assert.StartsWith(expectedStart, actualText);
        }

        /// <summary>
        /// Opens the named integration document, reads the words of the requested page and
        /// joins their text with single spaces.
        /// </summary>
        /// <param name="documentName">File name passed to <c>IntegrationHelpers.GetDocumentPath</c>.</param>
        /// <param name="pageNumber">Page number passed to <c>GetPage</c>.</param>
        /// <returns>The text of every word on the page, separated by a single space.</returns>
        private static string ReadPageText(string documentName, int pageNumber)
        {
            var documentPath = IntegrationHelpers.GetDocumentPath(documentName);

            // The document is disposed when this method returns, so the text is
            // materialised by string.Join before leaving the scope.
            using var document = PdfDocument.Open(documentPath);

            var pageWords = document.GetPage(pageNumber).GetWords();

            return string.Join(" ", pageWords.Select(word => word.Text));
        }
    }
}
|
@ -50,7 +50,7 @@
|
||||
continue;
|
||||
}
|
||||
|
||||
if (letter.Location.Y > y.Value + 0.5)
|
||||
if (letter.Location.Y < y.Value - 0.5)
|
||||
{
|
||||
if (lettersSoFar.Count > 0)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user