From fe8b587da6730c10045f1707b80fd18044082361 Mon Sep 17 00:00:00 2001 From: Mikhail Korolev Date: Mon, 22 Jun 2026 18:28:18 +0300 Subject: [PATCH] feat(ndc): support arbitrary input_encoding names and code pages (#15) ResolveEncoding now accepts any encoding name (e.g. iso-8859-5) or numeric code page (e.g. 28595) in addition to the latin1/utf-8/ascii fast paths. Unknown or invalid values throw NotSupportedException with a clear message instead of silently falling back to Latin1. CodePagesEncodingProvider is auto-registered in a static constructor so non-Latin1 ISO/Windows code pages (Cyrillic via ISO-8859-5, etc.) are available without caller setup. The binary parse path coerces numeric input_encoding values via ToString, matching NdcOptions.FromDictionary. Closes #15 --- .../NdcContentParser.cs | 58 +++++++++++--- .../Content/Ndc/NdcBinaryParserTests.cs | 75 ++++++++++++++++++- 2 files changed, 118 insertions(+), 15 deletions(-) diff --git a/src/FlexRender.Content.Ndc/NdcContentParser.cs b/src/FlexRender.Content.Ndc/NdcContentParser.cs index 8c8cf54..8dd4510 100644 --- a/src/FlexRender.Content.Ndc/NdcContentParser.cs +++ b/src/FlexRender.Content.Ndc/NdcContentParser.cs @@ -13,6 +13,17 @@ namespace FlexRender.Content.Ndc; /// public sealed class NdcContentParser : IContentParser, IBinaryContentParser { + /// + /// Registers the code-pages encoding provider exactly once so that non-Latin1 + /// ISO/Windows code pages (e.g. ISO-8859-5 / code page 28595) are available to + /// <see cref="Encoding.GetEncoding(string)"/> and <see cref="Encoding.GetEncoding(int)"/>. + /// Static initialization is thread-safe and runs before any member is accessed. 
+ /// + static NdcContentParser() + { + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + } + /// public string FormatName => "ndc"; @@ -43,8 +54,8 @@ public IReadOnlyList Parse(ReadOnlyMemory data, ContentPa return []; var encodingName = "latin1"; - if (options is not null && options.TryGetValue("input_encoding", out var enc) && enc is string encStr) - encodingName = encStr; + if (options is not null && options.TryGetValue("input_encoding", out var enc) && enc is not null) + encodingName = enc.ToString() ?? "latin1"; var encoding = ResolveEncoding(encodingName); var textContent = encoding.GetString(data.Span); @@ -52,21 +63,44 @@ public IReadOnlyList Parse(ReadOnlyMemory data, ContentPa } /// - /// Resolves a human-friendly encoding name to a <see cref="Encoding"/> instance. + /// Resolves an encoding identifier to an <see cref="Encoding"/> instance. /// /// - /// The encoding name. Supported values: latin1, iso-8859-1, utf-8, - /// utf8, ascii. Unrecognized values default to Latin-1. + /// The encoding identifier. All values are resolved through + /// <see cref="Encoding.GetEncoding(string)"/> (encoding names such as latin1, iso-8859-1, iso-8859-5, utf-8, ascii) + /// or <see cref="Encoding.GetEncoding(int)"/> (numeric code pages such as 28595). Common names + /// already return the corresponding framework singletons. The dashless utf8 alias is mapped to + /// utf-8 for backward compatibility. Non-Latin1 ISO/Windows code pages are supported through the + /// registered code-pages provider. /// /// The resolved <see cref="Encoding"/>. - internal static Encoding ResolveEncoding(string name) => - name.ToLowerInvariant() switch + /// Thrown when <paramref name="name"/> is <see langword="null"/>. + /// + /// Thrown when <paramref name="name"/> does not correspond to a known encoding name or code page. + /// + internal static Encoding ResolveEncoding(string name) + { + ArgumentNullException.ThrowIfNull(name); + + var trimmed = name.Trim(); + + // GetEncoding does not recognize the dashless "utf8" form; map it to the canonical "utf-8". 
+ if (string.Equals(trimmed, "utf8", StringComparison.OrdinalIgnoreCase)) + trimmed = "utf-8"; + + try { - "latin1" or "iso-8859-1" => Encoding.Latin1, - "utf-8" or "utf8" => Encoding.UTF8, - "ascii" => Encoding.ASCII, - _ => Encoding.Latin1 - }; + return int.TryParse(trimmed, System.Globalization.CultureInfo.InvariantCulture, out var codePage) + ? Encoding.GetEncoding(codePage) + : Encoding.GetEncoding(trimmed); + } + catch (Exception ex) when (ex is ArgumentException or NotSupportedException) + { + throw new NotSupportedException( + $"Unknown or unsupported input encoding: '{name}'. Use a known encoding name (e.g. 'iso-8859-5') or a numeric code page (e.g. '28595').", + ex); + } + } private static int CalculateMaxLineWidth(List tokens, int tabWidth = 8) { diff --git a/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs b/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs index 0de61a8..c71d411 100644 --- a/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs +++ b/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs @@ -93,7 +93,8 @@ public void WithNdc_RegistersBothStringAndBinaryParser() [InlineData("utf-8")] [InlineData("utf8")] [InlineData("ascii")] - [InlineData("unknown")] + [InlineData("iso-8859-5")] + [InlineData("28595")] public void ResolveEncoding_ReturnsValidEncoding(string name) { var encoding = NdcContentParser.ResolveEncoding(name); @@ -101,6 +102,16 @@ public void ResolveEncoding_ReturnsValidEncoding(string name) Assert.NotNull(encoding); } + [Theory] + [InlineData("iso-8859-5")] + [InlineData("28595")] + public void ResolveEncoding_Iso88595_ByNameOrCodePage_ResolvesToCodePage28595(string name) + { + var encoding = NdcContentParser.ResolveEncoding(name); + + Assert.Equal(28595, encoding.CodePage); + } + [Fact] public void ResolveEncoding_Latin1_ReturnsLatin1() { @@ -122,8 +133,66 @@ public void ResolveEncoding_Ascii_ReturnsAscii() } [Fact] - public void ResolveEncoding_Unknown_DefaultsToLatin1() + public void 
ResolveEncoding_Unknown_Throws() + { + var ex = Assert.Throws( + () => NdcContentParser.ResolveEncoding("something-else")); + + Assert.Contains("something-else", ex.Message, StringComparison.Ordinal); + } + + [Fact] + public void ResolveEncoding_UnknownCodePage_Throws() + { + var ex = Assert.Throws( + () => NdcContentParser.ResolveEncoding("999999")); + + Assert.Contains("999999", ex.Message, StringComparison.Ordinal); + } + + [Fact] + public void ParseBytes_WithIso88595Encoding_DecodesCyrillic() + { + var parser = new NdcContentParser(); + // Cyrillic text encoded with ISO-8859-5 (code page 28595). The default + // NDC charset uses "none" encoding, so the decoded Unicode survives unchanged. + // GetEncoding(28595) succeeds here because the parser's static constructor + // already registered the code-pages provider. + var text = "Привет"; + var iso88595 = global::System.Text.Encoding.GetEncoding(28595); + var data = iso88595.GetBytes(text); + var options = new Dictionary + { + ["input_encoding"] = "iso-8859-5" + }; + + var result = parser.Parse(data, EmptyContext, options); + + var root = Assert.IsType(Assert.Single(result)); + var row = Assert.IsType(root.Children[0]); + var textElement = Assert.IsType(row.Children[0]); + Assert.Equal("Привет", textElement.Content); + } + + [Fact] + public void ParseBytes_WithNumericInputEncoding_DecodesCyrillic() { - Assert.Same(global::System.Text.Encoding.Latin1, NdcContentParser.ResolveEncoding("something-else")); + var parser = new NdcContentParser(); + // input_encoding arrives as a boxed int (as YAML numeric scalars do), not a string. + // The binary path must coerce it to its string form so the numeric code page resolves. 
+ var text = "Привет"; + var iso88595 = global::System.Text.Encoding.GetEncoding(28595); + var data = iso88595.GetBytes(text); + var options = new Dictionary + { + ["input_encoding"] = 28595 + }; + + var result = parser.Parse(data, EmptyContext, options); + + var root = Assert.IsType(Assert.Single(result)); + var row = Assert.IsType(root.Children[0]); + var textElement = Assert.IsType(row.Children[0]); + Assert.Equal("Привет", textElement.Content); } }