#434 ensure companion stream is added to cross reference on building

This commit is contained in:
Eliot Jones 2022-04-02 15:58:22 -04:00
parent abcbdc55e3
commit 2fd46571b3
6 changed files with 82 additions and 51 deletions

View File

@ -1,39 +1,39 @@
namespace UglyToad.PdfPig.Tests.Integration
{
//using System;
//using System.Diagnostics;
//using System.IO;
//using Xunit;
using System;
using System.Diagnostics;
using System.IO;
using Xunit;
/// <summary>
/// A class for testing files which are not checked in to source control.
/// </summary>
public class LocalTests
{
    /// <summary>
    /// Folder on the local machine that is scanned for PDF files.
    /// </summary>
    private const string LocalPdfDirectory = @"C:\temp\pdfs";

    /// <summary>
    /// Opens every PDF found in <see cref="LocalPdfDirectory"/> with lenient parsing disabled
    /// and reads the text of each page, writing it to the trace output.
    /// Returns without doing anything when the folder does not exist.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Thrown when any file fails; the message names the file and the original error is the inner exception.
    /// </exception>
    [Fact]
    public void Tests()
    {
        // Directory.GetFiles throws DirectoryNotFoundException for a missing folder, which would
        // fail this test on every machine that does not have the local files available.
        if (!Directory.Exists(LocalPdfDirectory))
        {
            return;
        }

        var files = Directory.GetFiles(LocalPdfDirectory, "*.pdf");

        foreach (var file in files)
        {
            try
            {
                using (var document = PdfDocument.Open(file, new ParsingOptions { UseLenientParsing = false }))
                {
                    // Page numbers passed to GetPage start at 1.
                    for (var i = 1; i <= document.NumberOfPages; i++)
                    {
                        var page = document.GetPage(i);
                        var text = page.Text;
                        Trace.WriteLine(text);
                    }
                }
            }
            catch (Exception ex)
            {
                // Wrap so the failing file name is visible in the test output.
                throw new InvalidOperationException($"Error parsing: {Path.GetFileName(file)}.", ex);
            }
        }
    }
}
}

View File

@ -34,28 +34,35 @@
DictionaryToken trailerDictionary = new DictionaryToken(new Dictionary<NameToken, IToken>());
Dictionary<IndirectReference, long> objectOffsets = new Dictionary<IndirectReference, long>();
List<long> xrefSeqBytePos = new List<long>();
var xrefPartToBytePositionOrder = new List<long>();
var currentPart = parts.FirstOrDefault(x => x.Offset == firstCrossReferenceOffset);
if (currentPart == null)
{
// no XRef at given position
log.Warn("Did not found XRef object at specified startxref position " + firstCrossReferenceOffset);
log.Warn($"Did not find an XRef object at the specified startxref position {firstCrossReferenceOffset}");
// use all objects in byte position order (last entries overwrite previous ones)
xrefSeqBytePos.AddRange(parts.Select(x => x.Offset));
xrefSeqBytePos.Sort();
xrefPartToBytePositionOrder.AddRange(parts.Select(x => x.Offset));
xrefPartToBytePositionOrder.Sort();
}
else
{
// copy xref type
type = currentPart.Type;
// found starting Xref object
// add this and follow chain defined by 'Prev' keys
xrefSeqBytePos.Add(firstCrossReferenceOffset);
xrefPartToBytePositionOrder.Add(firstCrossReferenceOffset);
// Get any streams that are tied to this table.
var activePart = currentPart;
var dependents = parts.Where(x => x.TiedToXrefAtOffset == activePart.Offset);
foreach (var dependent in dependents)
{
xrefPartToBytePositionOrder.Add(dependent.Offset);
}
while (currentPart.Dictionary != null)
{
@ -72,21 +79,21 @@
break;
}
xrefSeqBytePos.Add(prevBytePos);
xrefPartToBytePositionOrder.Add(prevBytePos);
// sanity check to prevent infinite loops
if (xrefSeqBytePos.Count >= parts.Count)
if (xrefPartToBytePositionOrder.Count >= parts.Count)
{
break;
}
}
// have to reverse order so that later XRefs will overwrite previous ones
xrefSeqBytePos.Reverse();
xrefPartToBytePositionOrder.Reverse();
}
// merge used and sorted XRef/trailer
foreach (long bPos in xrefSeqBytePos)
foreach (long bPos in xrefPartToBytePositionOrder)
{
var currentObject = parts.First(x => x.Offset == bPos || x.Offset == bPos + offsetCorrection);
if (currentObject.Dictionary != null)

View File

@ -33,13 +33,24 @@
public CrossReferenceType Type { get; }
public CrossReferenceTablePart(IReadOnlyDictionary<IndirectReference, long> objectOffsets, long offset, long previous, DictionaryToken dictionary, CrossReferenceType type)
/// <summary>
/// For a cross reference stream that was reached through a cross reference table, the offset of that table.
/// The stream and the table should be used together when constructing the final table.
/// <see langword="null"/> when this part is not tied to a table.
/// </summary>
public long? TiedToXrefAtOffset { get; }
/// <summary>
/// Create a new <see cref="CrossReferenceTablePart"/>.
/// </summary>
/// <param name="objectOffsets">Value stored in <see cref="ObjectOffsets"/>.</param>
/// <param name="offset">Value stored in <see cref="Offset"/>.</param>
/// <param name="previous">Value stored in <see cref="Previous"/>.</param>
/// <param name="dictionary">Value stored in <see cref="Dictionary"/>.</param>
/// <param name="type">Value stored in <see cref="Type"/>.</param>
/// <param name="tiedToXrefAtOffset">Value stored in <see cref="TiedToXrefAtOffset"/>; may be null.</param>
public CrossReferenceTablePart(
    IReadOnlyDictionary<IndirectReference, long> objectOffsets,
    long offset, long previous,
    DictionaryToken dictionary,
    CrossReferenceType type,
    long? tiedToXrefAtOffset)
{
    // Kind of part and where it sits in the file.
    Type = type;
    Offset = offset;
    Previous = previous;
    TiedToXrefAtOffset = tiedToXrefAtOffset;

    // Content carried by the part.
    Dictionary = dictionary;
    ObjectOffsets = objectOffsets;
}
public void FixOffset(long offset)

View File

@ -15,7 +15,9 @@
public DictionaryToken Dictionary { get; set; }
public CrossReferenceType XRefType { get; set; }
public long? TiedToPreviousAtOffset { get; set; }
public void Add(long objectId, int generationNumber, long offset)
{
IndirectReference objKey = new IndirectReference(objectId, generationNumber);
@ -28,7 +30,7 @@
/// <summary>
/// Creates a <see cref="CrossReferenceTablePart"/> from the values currently held by this instance.
/// </summary>
/// <returns>A new part carrying the collected objects, offsets, dictionary, type and table link.</returns>
public CrossReferenceTablePart Build()
{
    // Single return using the six-argument constructor so TiedToPreviousAtOffset is carried over;
    // the superseded five-argument call made this statement unreachable.
    return new CrossReferenceTablePart(objects, Offset, Previous, Dictionary, XRefType, TiedToPreviousAtOffset);
}
}
}

View File

@ -82,9 +82,11 @@
// check for a XRef stream, it may contain some object ids of compressed objects
if (tableDictionary.ContainsKey(NameToken.XrefStm))
{
log.Debug("Cross reference table contained referenced to stream. Reading the stream.");
log.Debug("Cross reference table contained reference to stream. Reading the stream.");
int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int;
var tiedToTableAtOffset = tablePart.Offset;
int streamOffset = ((NumericToken) tableDictionary.Data[NameToken.XrefStm]).Int;
// check the xref stream reference
fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing);
@ -96,8 +98,13 @@
// Update the cross reference table to be a stream instead.
tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset));
tablePart = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset,
tablePart.Previous, tableDictionary, tablePart.Type);
tablePart = new CrossReferenceTablePart(
tablePart.ObjectOffsets,
streamOffset,
tablePart.Previous,
tableDictionary,
tablePart.Type,
tiedToTableAtOffset);
}
// Read the stream from the table.
@ -105,7 +112,7 @@
{
try
{
TryParseCrossReferenceStream(streamOffset, pdfScanner, out streamPart);
TryParseCrossReferenceStream(streamOffset, pdfScanner, tiedToTableAtOffset, out streamPart);
}
catch (InvalidOperationException ex)
{
@ -149,7 +156,7 @@
tokenScanner.Seek(previousCrossReferenceLocation);
// parse xref stream
if (!TryParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner, out var tablePart))
if (!TryParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner, null, out var tablePart))
{
if (!TryBruteForceXrefTableLocate(bytes, previousCrossReferenceLocation, out var actualOffset))
{
@ -218,7 +225,10 @@
return resolved;
}
private bool TryParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner,
private bool TryParseCrossReferenceStream(
long objByteOffset,
IPdfTokenScanner pdfScanner,
long? fromTableAtOffset,
out CrossReferenceTablePart xrefTablePart)
{
xrefTablePart = null;
@ -236,7 +246,7 @@
return false;
}
xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, objectStream);
xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, fromTableAtOffset, objectStream);
return true;
}

View File

@ -19,7 +19,7 @@
/// <summary>
/// Parses the decoded cross reference stream into a <see cref="CrossReferenceTablePart"/>.
/// </summary>
public CrossReferenceTablePart Parse(long streamOffset, StreamToken stream)
public CrossReferenceTablePart Parse(long streamOffset, long? fromTableAtOffset, StreamToken stream)
{
var decoded = stream.Decode(filterProvider);
@ -38,7 +38,8 @@
Offset = streamOffset,
Previous = previousOffset,
Dictionary = stream.StreamDictionary,
XRefType = CrossReferenceType.Stream
XRefType = CrossReferenceType.Stream,
TiedToPreviousAtOffset = fromTableAtOffset
};
var objectNumbers = GetObjectNumbers(stream.StreamDictionary);