fix #176, allow startxref to appear earlier in the document

2025-04-05 20:55:01 +08:00 · 2020-05-31 17:01:38 +01:00 · 2020-05-31 17:01:38 +01:00 · bf45602ac5
commit bf45602ac5
parent 4312aa470e
2 changed files with 51 additions and 29 deletions
--- a/src/UglyToad.PdfPig.Tests/Integration/SinglePageSimpleOpenOfficeTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/SinglePageSimpleOpenOfficeTests.cs
@ -90,5 +90,21 @@
                Assert.False(document.TryGetBookmarks(out _));
            }
        }
+
+        [Fact]
+        public void StartXRefNotNearEnd()
+        {
+            var bytes = File.ReadAllBytes(GetFilename());
+
+            var emptyTrailer = new byte[2026]; // Padding appended after the original file content (regression case for #176).
+            emptyTrailer[0] = 10; // First padding byte is 10 (ASCII line feed); the remaining bytes keep their default value of 0.
+
+            bytes = bytes.Concat(emptyTrailer).ToArray(); // Any "startxref" in the original file is now at least 2026 bytes before the end of the data.
+
+            using (var document = PdfDocument.Open(bytes, ParsingOptions.LenientParsingOff)) // Opened with the LenientParsingOff options.
+            {
+                Assert.Equal(1, document.NumberOfPages); // The padded document must still open and report exactly one page.
+            }
+        }
    }
 }
--- a/src/UglyToad.PdfPig/Parser/FileStructure/FileTrailerParser.cs
+++ b/src/UglyToad.PdfPig/Parser/FileStructure/FileTrailerParser.cs
@ -38,7 +38,7 @@
            (byte) 'e',
            (byte) 'f'
        };
-        
+
        public static long GetFirstCrossReferenceOffset(IInputBytes bytes, ISeekableTokenScanner scanner, bool isLenientParsing)
        {
            if (bytes == null)
@ -55,10 +55,6 @@

            var offsetFromEnd = fileLength < EndOfFileSearchRange ? (int)fileLength : EndOfFileSearchRange;

-            var startPosition = fileLength - offsetFromEnd;
-
-            bytes.Seek(startPosition);
-
            var startXrefPosition = GetStartXrefPosition(bytes, offsetFromEnd);

            scanner.Seek(startXrefPosition);
@ -96,38 +92,48 @@
            var startXrefs = new List<int>();

            var index = 0;
-            var offset = 0;
-            
-            // Starting scanning the last 1024 bytes.
-            while (bytes.MoveNext())
+
+            var fileLength = bytes.Length;
+            var multiple = 1;
+
+            var actualStartOffset = Math.Max(0, fileLength - (offsetFromEnd * multiple));
+            do
            {
-                offset++;
-                if (bytes.CurrentByte == StartXRefBytes[index])
+                multiple *= 2;
+                bytes.Seek(actualStartOffset);
+
+                // Scan the bytes from the current seek position onwards.
+                while (bytes.MoveNext())
                {
-                    // We might be reading "startxref".
-                    index++;
-                }
-                else
-                {
-                    index = 0;
+                    if (bytes.CurrentByte == StartXRefBytes[index])
+                    {
+                        // We might be reading "startxref".
+                        index++;
+                    }
+                    else
+                    {
+                        index = 0;
+                    }
+
+                    if (index == StartXRefBytes.Length)
+                    {
+                        // Add this "startxref" (position from the start of the document to the first 's').
+                        startXrefs.Add((int)bytes.CurrentOffset - StartXRefBytes.Length);
+
+                        // Continue scanning in case there are further "startxref"s. Not sure if this ever happens.
+                        index = 0;
+                    }
                }

-                if (index == StartXRefBytes.Length)
-                {
-                    // Add this "startxref" (position from the end of the document to the first 's').
-                    startXrefs.Add(offsetFromEnd - (offset - StartXRefBytes.Length));
-
-                    // Continue scanning in case there are further "startxref"s. Not sure if this ever happens.
-                    index = 0;
-                }
-            }
-
+                actualStartOffset = Math.Max(0, fileLength - (offsetFromEnd * multiple));
+            } while (startXrefs.Count == 0 && actualStartOffset > 0);
+            
            if (startXrefs.Count == 0)
            {
-                throw new PdfDocumentFormatException("Could not find the startxref within the last 1024 characters.");
+                throw new PdfDocumentFormatException($"Could not find the startxref within the last {offsetFromEnd} characters.");
            }

-            return bytes.Length - startXrefs[startXrefs.Count - 1];
+            return startXrefs[startXrefs.Count - 1];
        }
    }
 }