map missing character codes directly #44

Previously, if no matching Unicode value was found for a character code, we would return a null letter. Instead, we now map the character code directly to a character. This seems to work for most documents, except where there are ligatures (e.g. fi or ff), but it is still better than not returning anything.
2025-04-05 20:55:01 +08:00 · 2019-07-07 13:53:25 +01:00 · 2019-07-07 13:53:25 +01:00 · 557d8bc948
commit 557d8bc948
parent 198cca1336
2 changed files with 3 additions and 1 deletions
--- a/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs
+++ b/src/UglyToad.PdfPig.Tests/Integration/LaTexTests.cs
@ -24,7 +24,7 @@

                var page2 = document.GetPage(2);

-                Assert.Contains("isθc={θc1,θc2,...,θcn},", page2.Text);
+                Assert.Contains("is~θc={θc1,θc2,...,θcn},", page2.Text);
            }
        }

--- a/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
+++ b/src/UglyToad.PdfPig/Graphics/ContentStreamProcessor.cs
@ -137,6 +137,8 @@
                if (!foundUnicode || unicode == null)
                {
                    log.Warn($"We could not find the corresponding character with code {code} in font {font.Name}.");
+                    // Try casting directly to string as in PDFBox 1.8.
+                    unicode = new string((char)code, 1);
                }

                var wordSpacing = 0m;