From 9bfe69aef1d19f2132d8a82e94f2792c65872063 Mon Sep 17 00:00:00 2001 From: Plaisted Date: Tue, 19 Jan 2021 18:06:50 -0600 Subject: [PATCH] removing locking --- .../NumericTokenizer.cs | 316 +++++----- .../Scanner/CoreTokenScanner.cs | 7 +- .../StringTokenizer.cs | 560 +++++++++--------- .../UglyToad.PdfPig.Tokenization.csproj | 48 +- 4 files changed, 466 insertions(+), 465 deletions(-) diff --git a/src/UglyToad.PdfPig.Tokenization/NumericTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/NumericTokenizer.cs index 3b3dd813..798892f4 100644 --- a/src/UglyToad.PdfPig.Tokenization/NumericTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/NumericTokenizer.cs @@ -1,158 +1,158 @@ -namespace UglyToad.PdfPig.Tokenization -{ - using System; - using System.Globalization; - using System.Text; - using Core; - using Tokens; - - internal class NumericTokenizer : ITokenizer - { - private static readonly StringBuilderPool StringBuilderPool = new StringBuilderPool(10); - - private const byte Zero = 48; - private const byte Nine = 57; - - public bool ReadsNextByte { get; } = true; - - public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) - { - token = null; - - StringBuilder characters; - - if ((currentByte >= Zero && currentByte <= Nine) || currentByte == '-' || currentByte == '+' || currentByte == '.') - { - characters = StringBuilderPool.Borrow(); - characters.Append((char)currentByte); - } - else - { - return false; - } - - while (inputBytes.MoveNext()) - { - var b = inputBytes.CurrentByte; - - if ((b >= Zero && b <= Nine) || - b == '-' || - b == '+' || - b == '.' || - b == 'E' || - b == 'e') - { - characters.Append((char)b); - } - else - { - break; - } - } - - try - { - var str = characters.ToString(); - StringBuilderPool.Return(characters); - - switch (str) - { - case "-1": - token = NumericToken.MinusOne; - return true; - case "-": - case ".": - case "0": - case "0000": - token = NumericToken.Zero; - return true; - case "1": - token = NumericToken.One; - return true; - case "2": - token = NumericToken.Two; - return true; - case "3": - token = NumericToken.Three; - return true; - case "4": - token = NumericToken.Four; - return true; - case "5": - token = NumericToken.Five; - return true; - case "6": - token = NumericToken.Six; - return true; - case "7": - token = NumericToken.Seven; - return true; - case "8": - token = NumericToken.Eight; - return true; - case "9": - token = NumericToken.Nine; - return true; - case "10": - token = NumericToken.Ten; - return true; - case "11": - token = NumericToken.Eleven; - return true; - case "12": - token = NumericToken.Twelve; - return true; - case "13": - token = NumericToken.Thirteen; - return true; - case "14": - token = NumericToken.Fourteen; - return true; - case "15": - token = NumericToken.Fifteen; - return true; - case "16": - token = NumericToken.Sixteen; - return true; - case "17": - token = NumericToken.Seventeen; - return true; - case "18": - token = NumericToken.Eighteen; - return true; - case "19": - token = NumericToken.Nineteen; - return true; - case "20": - token = NumericToken.Twenty; - return true; - case "100": - token = NumericToken.OneHundred; - return true; - case "500": - token = NumericToken.FiveHundred; - return true; - case "1000": - token = NumericToken.OneThousand; - return true; - default: - if (!decimal.TryParse(str, NumberStyles.Any, CultureInfo.InvariantCulture, out var value)) - { - return false; - } - - token = new NumericToken(value); - return true; - } - } - catch (FormatException) - { - return false; - } - catch (OverflowException) - { - return false; - } - } - } -} +namespace UglyToad.PdfPig.Tokenization +{ + using System; + using System.Globalization; + using System.Text; + using Core; + using Tokens; + + internal class NumericTokenizer : ITokenizer + { + private readonly StringBuilder stringBuilder = new(); + + private const byte Zero = 48; + private const byte Nine = 57; + + public bool ReadsNextByte { get; } = true; + + public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) + { + token = null; + + StringBuilder characters; + + if ((currentByte >= Zero && currentByte <= Nine) || currentByte == '-' || currentByte == '+' || currentByte == '.') + { + characters = stringBuilder; + characters.Append((char)currentByte); + } + else + { + return false; + } + + while (inputBytes.MoveNext()) + { + var b = inputBytes.CurrentByte; + + if ((b >= Zero && b <= Nine) || + b == '-' || + b == '+' || + b == '.' || + b == 'E' || + b == 'e') + { + characters.Append((char)b); + } + else + { + break; + } + } + + try + { + var str = characters.ToString(); + characters.Clear(); + + switch (str) + { + case "-1": + token = NumericToken.MinusOne; + return true; + case "-": + case ".": + case "0": + case "0000": + token = NumericToken.Zero; + return true; + case "1": + token = NumericToken.One; + return true; + case "2": + token = NumericToken.Two; + return true; + case "3": + token = NumericToken.Three; + return true; + case "4": + token = NumericToken.Four; + return true; + case "5": + token = NumericToken.Five; + return true; + case "6": + token = NumericToken.Six; + return true; + case "7": + token = NumericToken.Seven; + return true; + case "8": + token = NumericToken.Eight; + return true; + case "9": + token = NumericToken.Nine; + return true; + case "10": + token = NumericToken.Ten; + return true; + case "11": + token = NumericToken.Eleven; + return true; + case "12": + token = NumericToken.Twelve; + return true; + case "13": + token = NumericToken.Thirteen; + return true; + case "14": + token = NumericToken.Fourteen; + return true; + case "15": + token = NumericToken.Fifteen; + return true; + case "16": + token = NumericToken.Sixteen; + return true; + case "17": + token = NumericToken.Seventeen; + return true; + case "18": + token = NumericToken.Eighteen; + return true; + case "19": + token = NumericToken.Nineteen; + return true; + case "20": + token = NumericToken.Twenty; + return true; + case "100": + token = NumericToken.OneHundred; + return true; + case "500": + token = NumericToken.FiveHundred; + return true; + case "1000": + token = NumericToken.OneThousand; + return true; + default: + if (!decimal.TryParse(str, NumberStyles.Any, CultureInfo.InvariantCulture, out var value)) + { + return false; + } + + token = new NumericToken(value); + return true; + } + } + catch (FormatException) + { + return false; + } + catch (OverflowException) + { + return false; + } + } + } +} diff --git a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs index 22f68b34..5182ba59 100644 --- a/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs +++ b/src/UglyToad.PdfPig.Tokenization/Scanner/CoreTokenScanner.cs @@ -15,9 +15,10 @@ private static readonly DictionaryTokenizer DictionaryTokenizer = new DictionaryTokenizer(); private static readonly HexTokenizer HexTokenizer = new HexTokenizer(); private static readonly NameTokenizer NameTokenizer = new NameTokenizer(); - private static readonly NumericTokenizer NumericTokenizer = new NumericTokenizer(); - private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer(); - private static readonly StringTokenizer StringTokenizer = new StringTokenizer(); + private static readonly PlainTokenizer PlainTokenizer = new PlainTokenizer(); + + private readonly NumericTokenizer NumericTokenizer = new NumericTokenizer(); + private readonly StringTokenizer StringTokenizer = new StringTokenizer(); private readonly ScannerScope scope; private readonly IInputBytes inputBytes; diff --git a/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs b/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs index bc68e919..1b9985f8 100644 --- a/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs +++ b/src/UglyToad.PdfPig.Tokenization/StringTokenizer.cs @@ -1,281 +1,281 @@ -namespace UglyToad.PdfPig.Tokenization -{ - using System.Text; - using Core; - using Tokens; - - internal class StringTokenizer : ITokenizer - { - private static readonly StringBuilderPool StringBuilderPool = new StringBuilderPool(16); - public bool ReadsNextByte { get; } = false; - - public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) - { - token = null; - - if (inputBytes == null) - { - return false; - } - - if (currentByte != '(') - { - return false; - } - - var builder = StringBuilderPool.Borrow(); - var numberOfBrackets = 1; - var isEscapeActive = false; - var isLineBreaking = false; - - var octalModeActive = false; - - short[] octal = { 0, 0, 0 }; - var octalsRead = 0; - - while (inputBytes.MoveNext()) - { - var b = inputBytes.CurrentByte; - var c = (char)b; - - if (octalModeActive) - { - var nextCharacterOctal = c >= '0' && c <= '7'; - - if (nextCharacterOctal) - { - // left shift the octals. - LeftShiftOctal(c, octalsRead, octal); - octalsRead++; - } - - if (octalsRead == 3 || !nextCharacterOctal) - { - var characterCode = OctalHelpers.FromOctalDigits(octal); - - // For now :( - // TODO: I have a sneaking suspicion this is wrong, not sure what behaviour is for large octal numbers - builder.Append((char)characterCode); - - octal[0] = 0; - octal[1] = 0; - octal[2] = 0; - octalsRead = 0; - octalModeActive = false; - } - - if (nextCharacterOctal) - { - continue; - } - } - - switch (c) - { - case ')': - isLineBreaking = false; - if (!isEscapeActive) - { - numberOfBrackets--; - } - - isEscapeActive = false; - if (numberOfBrackets > 0) - { - builder.Append(c); - } - - // TODO: Check for other ends of string where the string is improperly formatted. See commented method - numberOfBrackets = CheckForEndOfString(numberOfBrackets, inputBytes); - - break; - case '(': - isLineBreaking = false; - - if (!isEscapeActive) - { - numberOfBrackets++; - } - - isEscapeActive = false; - builder.Append(c); - break; - // Escape - case '\\': - isLineBreaking = false; - // Escaped backslash - if (isEscapeActive) - { - builder.Append(c); - isEscapeActive = false; - } - else - { - isEscapeActive = true; - } - break; - default: - if (isLineBreaking) - { - if (ReadHelper.IsEndOfLine(c)) - { - continue; - } - - isLineBreaking = false; - builder.Append(c); - } - else if (isEscapeActive) - { - ProcessEscapedCharacter(c, builder, octal, ref octalModeActive, ref octalsRead, ref isLineBreaking); - isEscapeActive = false; - } - else - { - builder.Append(c); - } - - break; - } - - if (numberOfBrackets <= 0) - { - break; - } - } - - StringToken.Encoding encodedWith; - string tokenStr; - if (builder.Length >= 2) - { - if (builder[0] == 0xFE && builder[1] == 0xFF) - { - var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString()); - - tokenStr = Encoding.BigEndianUnicode.GetString(rawBytes).Substring(1); - - encodedWith = StringToken.Encoding.Utf16BE; - } - else if (builder[0] == 0xFF && builder[1] == 0xFE) - { - var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString()); - - tokenStr = Encoding.Unicode.GetString(rawBytes).Substring(1); - - encodedWith = StringToken.Encoding.Utf16; - } - else - { - tokenStr = builder.ToString(); - - encodedWith = StringToken.Encoding.Iso88591; - } - } - else - { - tokenStr = builder.ToString(); - - encodedWith = StringToken.Encoding.Iso88591; - } - - StringBuilderPool.Return(builder); - - token = new StringToken(tokenStr, encodedWith); - - return true; - } - - private static void LeftShiftOctal(char nextOctalChar, int octalsRead, short[] octals) - { - for (var i = octalsRead; i > 0; i--) - { - octals[i] = octals[i - 1]; - } - - var value = nextOctalChar.CharacterToShort(); - - octals[0] = value; - } - - private static void ProcessEscapedCharacter(char c, StringBuilder builder, short[] octal, ref bool isOctalActive, - ref int octalsRead, ref bool isLineBreaking) - { - switch (c) - { - case 'n': - builder.Append('\n'); - break; - case 'r': - builder.Append('\r'); - break; - case 't': - builder.Append('\t'); - break; - case 'b': - builder.Append('\b'); - break; - case 'f': - builder.Append('\f'); - break; - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - octal[0] = c.CharacterToShort(); - isOctalActive = true; - octalsRead = 1; - break; - default: - if (c == ReadHelper.AsciiCarriageReturn || c == ReadHelper.AsciiLineFeed) - { - isLineBreaking = true; - } - else - { - // Drop the backslash - builder.Append(c); - } - break; - } - } - - private static int CheckForEndOfString(int numberOfBrackets, IInputBytes bytes) - { - const byte lineFeed = 10; - const byte carriageReturn = 13; - - var braces = numberOfBrackets; - var nextThreeBytes = new byte[3]; - - var startAt = bytes.CurrentOffset; - - var amountRead = bytes.Read(nextThreeBytes); - - // Check the next 3 bytes if available - // The following cases are valid indicators for the end of the string - // 1. Next line contains another COSObject: CR + LF + '/' - // 2. COSDictionary ends in the next line: CR + LF + '>' - // 3. Next line contains another COSObject: CR + '/' - // 4. COSDictionary ends in the next line: CR + '>' - if (amountRead == 3 && nextThreeBytes[0] == carriageReturn) - { - if ((nextThreeBytes[1] == lineFeed && (nextThreeBytes[2] == '/') || nextThreeBytes[2] == '>') - || nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>') - { - braces = 0; - } - } - - if (amountRead > 0) - { - bytes.Seek(startAt); - } - - return braces; - } - } +namespace UglyToad.PdfPig.Tokenization +{ + using System.Text; + using Core; + using Tokens; + + internal class StringTokenizer : ITokenizer + { + private readonly StringBuilder stringBuilder = new(); + public bool ReadsNextByte { get; } = false; + + public bool TryTokenize(byte currentByte, IInputBytes inputBytes, out IToken token) + { + token = null; + + if (inputBytes == null) + { + return false; + } + + if (currentByte != '(') + { + return false; + } + + var builder = stringBuilder; + var numberOfBrackets = 1; + var isEscapeActive = false; + var isLineBreaking = false; + + var octalModeActive = false; + + short[] octal = { 0, 0, 0 }; + var octalsRead = 0; + + while (inputBytes.MoveNext()) + { + var b = inputBytes.CurrentByte; + var c = (char)b; + + if (octalModeActive) + { + var nextCharacterOctal = c >= '0' && c <= '7'; + + if (nextCharacterOctal) + { + // left shift the octals. + LeftShiftOctal(c, octalsRead, octal); + octalsRead++; + } + + if (octalsRead == 3 || !nextCharacterOctal) + { + var characterCode = OctalHelpers.FromOctalDigits(octal); + + // For now :( + // TODO: I have a sneaking suspicion this is wrong, not sure what behaviour is for large octal numbers + builder.Append((char)characterCode); + + octal[0] = 0; + octal[1] = 0; + octal[2] = 0; + octalsRead = 0; + octalModeActive = false; + } + + if (nextCharacterOctal) + { + continue; + } + } + + switch (c) + { + case ')': + isLineBreaking = false; + if (!isEscapeActive) + { + numberOfBrackets--; + } + + isEscapeActive = false; + if (numberOfBrackets > 0) + { + builder.Append(c); + } + + // TODO: Check for other ends of string where the string is improperly formatted. See commented method + numberOfBrackets = CheckForEndOfString(numberOfBrackets, inputBytes); + + break; + case '(': + isLineBreaking = false; + + if (!isEscapeActive) + { + numberOfBrackets++; + } + + isEscapeActive = false; + builder.Append(c); + break; + // Escape + case '\\': + isLineBreaking = false; + // Escaped backslash + if (isEscapeActive) + { + builder.Append(c); + isEscapeActive = false; + } + else + { + isEscapeActive = true; + } + break; + default: + if (isLineBreaking) + { + if (ReadHelper.IsEndOfLine(c)) + { + continue; + } + + isLineBreaking = false; + builder.Append(c); + } + else if (isEscapeActive) + { + ProcessEscapedCharacter(c, builder, octal, ref octalModeActive, ref octalsRead, ref isLineBreaking); + isEscapeActive = false; + } + else + { + builder.Append(c); + } + + break; + } + + if (numberOfBrackets <= 0) + { + break; + } + } + + StringToken.Encoding encodedWith; + string tokenStr; + if (builder.Length >= 2) + { + if (builder[0] == 0xFE && builder[1] == 0xFF) + { + var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString()); + + tokenStr = Encoding.BigEndianUnicode.GetString(rawBytes).Substring(1); + + encodedWith = StringToken.Encoding.Utf16BE; + } + else if (builder[0] == 0xFF && builder[1] == 0xFE) + { + var rawBytes = OtherEncodings.StringAsLatin1Bytes(builder.ToString()); + + tokenStr = Encoding.Unicode.GetString(rawBytes).Substring(1); + + encodedWith = StringToken.Encoding.Utf16; + } + else + { + tokenStr = builder.ToString(); + + encodedWith = StringToken.Encoding.Iso88591; + } + } + else + { + tokenStr = builder.ToString(); + + encodedWith = StringToken.Encoding.Iso88591; + } + + builder.Clear(); + + token = new StringToken(tokenStr, encodedWith); + + return true; + } + + private static void LeftShiftOctal(char nextOctalChar, int octalsRead, short[] octals) + { + for (var i = octalsRead; i > 0; i--) + { + octals[i] = octals[i - 1]; + } + + var value = nextOctalChar.CharacterToShort(); + + octals[0] = value; + } + + private static void ProcessEscapedCharacter(char c, StringBuilder builder, short[] octal, ref bool isOctalActive, + ref int octalsRead, ref bool isLineBreaking) + { + switch (c) + { + case 'n': + builder.Append('\n'); + break; + case 'r': + builder.Append('\r'); + break; + case 't': + builder.Append('\t'); + break; + case 'b': + builder.Append('\b'); + break; + case 'f': + builder.Append('\f'); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + octal[0] = c.CharacterToShort(); + isOctalActive = true; + octalsRead = 1; + break; + default: + if (c == ReadHelper.AsciiCarriageReturn || c == ReadHelper.AsciiLineFeed) + { + isLineBreaking = true; + } + else + { + // Drop the backslash + builder.Append(c); + } + break; + } + } + + private static int CheckForEndOfString(int numberOfBrackets, IInputBytes bytes) + { + const byte lineFeed = 10; + const byte carriageReturn = 13; + + var braces = numberOfBrackets; + var nextThreeBytes = new byte[3]; + + var startAt = bytes.CurrentOffset; + + var amountRead = bytes.Read(nextThreeBytes); + + // Check the next 3 bytes if available + // The following cases are valid indicators for the end of the string + // 1. Next line contains another COSObject: CR + LF + '/' + // 2. COSDictionary ends in the next line: CR + LF + '>' + // 3. Next line contains another COSObject: CR + '/' + // 4. COSDictionary ends in the next line: CR + '>' + if (amountRead == 3 && nextThreeBytes[0] == carriageReturn) + { + if ((nextThreeBytes[1] == lineFeed && (nextThreeBytes[2] == '/') || nextThreeBytes[2] == '>') + || nextThreeBytes[1] == '/' || nextThreeBytes[1] == '>') + { + braces = 0; + } + } + + if (amountRead > 0) + { + bytes.Seek(startAt); + } + + return braces; + } + } } \ No newline at end of file diff --git a/src/UglyToad.PdfPig.Tokenization/UglyToad.PdfPig.Tokenization.csproj b/src/UglyToad.PdfPig.Tokenization/UglyToad.PdfPig.Tokenization.csproj index a8effc40..fa40e532 100644 --- a/src/UglyToad.PdfPig.Tokenization/UglyToad.PdfPig.Tokenization.csproj +++ b/src/UglyToad.PdfPig.Tokenization/UglyToad.PdfPig.Tokenization.csproj @@ -1,25 +1,25 @@ - - - netstandard2.0;net45;net451;net452;net46;net461;net462;net47 - latest - 0.1.4 - False - true - true - ..\pdfpig.snk - - - true - - - - - - - - - - - - + + + netstandard2.0;net45;net451;net452;net46;net461;net462;net47 + latest + 0.1.4 + False + true + true + ..\pdfpig.snk + + + true + + + + + + + + + + + + \ No newline at end of file