From 2fd46571b3f199e0ffd15e0b1ca73aac365035c2 Mon Sep 17 00:00:00 2001 From: Eliot Jones Date: Sat, 2 Apr 2022 15:58:22 -0400 Subject: [PATCH] #434 ensure companion stream is added to cross reference on building --- .../Integration/LocalTests.cs | 56 +++++++++---------- .../CrossReferenceTableBuilder.cs | 27 +++++---- .../CrossReference/CrossReferenceTablePart.cs | 13 ++++- .../CrossReferenceTablePartBuilder.cs | 6 +- .../FileStructure/CrossReferenceParser.cs | 26 ++++++--- .../CrossReferenceStreamParser.cs | 5 +- 6 files changed, 82 insertions(+), 51 deletions(-) diff --git a/src/UglyToad.PdfPig.Tests/Integration/LocalTests.cs b/src/UglyToad.PdfPig.Tests/Integration/LocalTests.cs index 94545f2f..253fc776 100644 --- a/src/UglyToad.PdfPig.Tests/Integration/LocalTests.cs +++ b/src/UglyToad.PdfPig.Tests/Integration/LocalTests.cs @@ -1,39 +1,39 @@ namespace UglyToad.PdfPig.Tests.Integration { - //using System; - //using System.Diagnostics; - //using System.IO; - //using Xunit; + using System; + using System.Diagnostics; + using System.IO; + using Xunit; /// /// A class for testing files which are not checked in to source control. /// public class LocalTests { - //[Fact] - //public void Tests() - //{ - // var files = Directory.GetFiles(@"C:\temp\pdfs", "*.pdf"); + [Fact] + public void Tests() + { + var files = Directory.GetFiles(@"C:\temp\pdfs", "*.pdf"); - // foreach (var file in files) - // { - // try - // { - // using (var document = PdfDocument.Open(file, new ParsingOptions { UseLenientParsing = false })) - // { - // for (var i = 1; i <= document.NumberOfPages; i++) - // { - // var page = document.GetPage(i); - // var text = page.Text; - // Trace.WriteLine(text); - // } - // } - // } - // catch (Exception ex) - // { - // throw new InvalidOperationException($"Error parsing: {Path.GetFileName(file)}.", ex); - // } - // } - //} + foreach (var file in files) + { + try + { + using (var document = PdfDocument.Open(file, new ParsingOptions { UseLenientParsing = false })) + { + for (var i = 1; i <= document.NumberOfPages; i++) + { + var page = document.GetPage(i); + var text = page.Text; + Trace.WriteLine(text); + } + } + } + catch (Exception ex) + { + throw new InvalidOperationException($"Error parsing: {Path.GetFileName(file)}.", ex); + } + } + } } } diff --git a/src/UglyToad.PdfPig/CrossReference/CrossReferenceTableBuilder.cs b/src/UglyToad.PdfPig/CrossReference/CrossReferenceTableBuilder.cs index dc43a446..8b5c2a62 100644 --- a/src/UglyToad.PdfPig/CrossReference/CrossReferenceTableBuilder.cs +++ b/src/UglyToad.PdfPig/CrossReference/CrossReferenceTableBuilder.cs @@ -34,28 +34,35 @@ DictionaryToken trailerDictionary = new DictionaryToken(new Dictionary()); Dictionary objectOffsets = new Dictionary(); - List xrefSeqBytePos = new List(); + var xrefPartToBytePositionOrder = new List(); var currentPart = parts.FirstOrDefault(x => x.Offset == firstCrossReferenceOffset); if (currentPart == null) { // no XRef at given position - log.Warn("Did not found XRef object at specified startxref position " + firstCrossReferenceOffset); + log.Warn($"Did not find an XRef object at the specified startxref position {firstCrossReferenceOffset}"); // use all objects in byte position order (last entries overwrite previous ones) - xrefSeqBytePos.AddRange(parts.Select(x => x.Offset)); - xrefSeqBytePos.Sort(); + xrefPartToBytePositionOrder.AddRange(parts.Select(x => x.Offset)); + xrefPartToBytePositionOrder.Sort(); } else { // copy xref type type = currentPart.Type; - // found starting Xref object // add this and follow chain defined by 'Prev' keys - xrefSeqBytePos.Add(firstCrossReferenceOffset); + xrefPartToBytePositionOrder.Add(firstCrossReferenceOffset); + + // Get any streams that are tied to this table. + var activePart = currentPart; + var dependents = parts.Where(x => x.TiedToXrefAtOffset == activePart.Offset); + foreach (var dependent in dependents) + { + xrefPartToBytePositionOrder.Add(dependent.Offset); + } while (currentPart.Dictionary != null) { @@ -72,21 +79,21 @@ break; } - xrefSeqBytePos.Add(prevBytePos); + xrefPartToBytePositionOrder.Add(prevBytePos); // sanity check to prevent infinite loops - if (xrefSeqBytePos.Count >= parts.Count) + if (xrefPartToBytePositionOrder.Count >= parts.Count) { break; } } // have to reverse order so that later XRefs will overwrite previous ones - xrefSeqBytePos.Reverse(); + xrefPartToBytePositionOrder.Reverse(); } // merge used and sorted XRef/trailer - foreach (long bPos in xrefSeqBytePos) + foreach (long bPos in xrefPartToBytePositionOrder) { var currentObject = parts.First(x => x.Offset == bPos || x.Offset == bPos + offsetCorrection); if (currentObject.Dictionary != null) diff --git a/src/UglyToad.PdfPig/CrossReference/CrossReferenceTablePart.cs b/src/UglyToad.PdfPig/CrossReference/CrossReferenceTablePart.cs index 02b01b48..69a87fe5 100644 --- a/src/UglyToad.PdfPig/CrossReference/CrossReferenceTablePart.cs +++ b/src/UglyToad.PdfPig/CrossReference/CrossReferenceTablePart.cs @@ -33,13 +33,24 @@ public CrossReferenceType Type { get; } - public CrossReferenceTablePart(IReadOnlyDictionary objectOffsets, long offset, long previous, DictionaryToken dictionary, CrossReferenceType type) + /// + /// For Xref streams indicated by tables they should be used together when constructing the final table. + /// + public long? TiedToXrefAtOffset { get; } + + public CrossReferenceTablePart( + IReadOnlyDictionary objectOffsets, + long offset, long previous, + DictionaryToken dictionary, + CrossReferenceType type, + long? tiedToXrefAtOffset) { ObjectOffsets = objectOffsets; Offset = offset; Previous = previous; Dictionary = dictionary; Type = type; + TiedToXrefAtOffset = tiedToXrefAtOffset; } public void FixOffset(long offset) diff --git a/src/UglyToad.PdfPig/CrossReference/CrossReferenceTablePartBuilder.cs b/src/UglyToad.PdfPig/CrossReference/CrossReferenceTablePartBuilder.cs index 48ff1f44..d2ca13af 100644 --- a/src/UglyToad.PdfPig/CrossReference/CrossReferenceTablePartBuilder.cs +++ b/src/UglyToad.PdfPig/CrossReference/CrossReferenceTablePartBuilder.cs @@ -15,7 +15,9 @@ public DictionaryToken Dictionary { get; set; } public CrossReferenceType XRefType { get; set; } - + + public long? TiedToPreviousAtOffset { get; set; } + public void Add(long objectId, int generationNumber, long offset) { IndirectReference objKey = new IndirectReference(objectId, generationNumber); @@ -28,7 +30,7 @@ public CrossReferenceTablePart Build() { - return new CrossReferenceTablePart(objects, Offset, Previous, Dictionary, XRefType); + return new CrossReferenceTablePart(objects, Offset, Previous, Dictionary, XRefType, TiedToPreviousAtOffset); } } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs index 0f63cece..70cae4df 100644 --- a/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs +++ b/src/UglyToad.PdfPig/Parser/FileStructure/CrossReferenceParser.cs @@ -82,9 +82,11 @@ // check for a XRef stream, it may contain some object ids of compressed objects if (tableDictionary.ContainsKey(NameToken.XrefStm)) { - log.Debug("Cross reference table contained referenced to stream. Reading the stream."); + log.Debug("Cross reference table contained reference to stream. Reading the stream."); - int streamOffset = ((NumericToken)tableDictionary.Data[NameToken.XrefStm]).Int; + var tiedToTableAtOffset = tablePart.Offset; + + int streamOffset = ((NumericToken) tableDictionary.Data[NameToken.XrefStm]).Int; // check the xref stream reference fixedOffset = offsetValidator.CheckXRefOffset(streamOffset, tokenScanner, bytes, isLenientParsing); @@ -96,8 +98,13 @@ // Update the cross reference table to be a stream instead. tableDictionary = tableDictionary.With(NameToken.XrefStm, new NumericToken(streamOffset)); - tablePart = new CrossReferenceTablePart(tablePart.ObjectOffsets, streamOffset, - tablePart.Previous, tableDictionary, tablePart.Type); + tablePart = new CrossReferenceTablePart( + tablePart.ObjectOffsets, + streamOffset, + tablePart.Previous, + tableDictionary, + tablePart.Type, + tiedToTableAtOffset); } // Read the stream from the table. @@ -105,7 +112,7 @@ { try { - TryParseCrossReferenceStream(streamOffset, pdfScanner, out streamPart); + TryParseCrossReferenceStream(streamOffset, pdfScanner, tiedToTableAtOffset, out streamPart); } catch (InvalidOperationException ex) { @@ -149,7 +156,7 @@ tokenScanner.Seek(previousCrossReferenceLocation); // parse xref stream - if (!TryParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner, out var tablePart)) + if (!TryParseCrossReferenceStream(previousCrossReferenceLocation, pdfScanner, null, out var tablePart)) { if (!TryBruteForceXrefTableLocate(bytes, previousCrossReferenceLocation, out var actualOffset)) { @@ -218,7 +225,10 @@ return resolved; } - private bool TryParseCrossReferenceStream(long objByteOffset, IPdfTokenScanner pdfScanner, + private bool TryParseCrossReferenceStream( + long objByteOffset, + IPdfTokenScanner pdfScanner, + long? fromTableAtOffset, out CrossReferenceTablePart xrefTablePart) { xrefTablePart = null; @@ -236,7 +246,7 @@ return false; } - xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, objectStream); + xrefTablePart = crossReferenceStreamParser.Parse(objByteOffset, fromTableAtOffset, objectStream); return true; } diff --git a/src/UglyToad.PdfPig/Parser/Parts/CrossReference/CrossReferenceStreamParser.cs b/src/UglyToad.PdfPig/Parser/Parts/CrossReference/CrossReferenceStreamParser.cs index 02904ed7..e27d74a9 100644 --- a/src/UglyToad.PdfPig/Parser/Parts/CrossReference/CrossReferenceStreamParser.cs +++ b/src/UglyToad.PdfPig/Parser/Parts/CrossReference/CrossReferenceStreamParser.cs @@ -19,7 +19,7 @@ /// /// Parses through the unfiltered stream and populates the xrefTable HashMap. /// - public CrossReferenceTablePart Parse(long streamOffset, StreamToken stream) + public CrossReferenceTablePart Parse(long streamOffset, long? fromTableAtOffset, StreamToken stream) { var decoded = stream.Decode(filterProvider); @@ -38,7 +38,8 @@ Offset = streamOffset, Previous = previousOffset, Dictionary = stream.StreamDictionary, - XRefType = CrossReferenceType.Stream + XRefType = CrossReferenceType.Stream, + TiedToPreviousAtOffset = fromTableAtOffset }; var objectNumbers = GetObjectNumbers(stream.StreamDictionary);