From fe8b587da6730c10045f1707b80fd18044082361 Mon Sep 17 00:00:00 2001
From: Mikhail Korolev <michael443959@gmail.com>
Date: Mon, 22 Jun 2026 18:28:18 +0300
Subject: [PATCH] feat(ndc): support arbitrary input_encoding names and code
 pages (#15)

ResolveEncoding now resolves any encoding name (e.g. iso-8859-5) or
numeric code page (e.g. 28595) through Encoding.GetEncoding; the
previously supported latin1/iso-8859-1/utf-8/ascii names keep working,
and the dashless utf8 alias is mapped to utf-8. Unknown or invalid
values throw NotSupportedException with a clear message instead of
silently falling back to Latin1.

CodePagesEncodingProvider is auto-registered in a static constructor so
non-Latin1 ISO/Windows code pages (Cyrillic via ISO-8859-5, etc.) are
available without caller setup. The binary parse path coerces numeric
input_encoding values via ToString, matching NdcOptions.FromDictionary.

Closes #15
---
 .../NdcContentParser.cs                       | 58 +++++++++++---
 .../Content/Ndc/NdcBinaryParserTests.cs       | 75 ++++++++++++++++++-
 2 files changed, 118 insertions(+), 15 deletions(-)
diff --git a/src/FlexRender.Content.Ndc/NdcContentParser.cs b/src/FlexRender.Content.Ndc/NdcContentParser.cs
index 8c8cf54..8dd4510 100644
--- a/src/FlexRender.Content.Ndc/NdcContentParser.cs
+++ b/src/FlexRender.Content.Ndc/NdcContentParser.cs
@@ -13,6 +13,17 @@ namespace FlexRender.Content.Ndc;
 /// </summary>
 public sealed class NdcContentParser : IContentParser, IBinaryContentParser
 {
+    /// <summary>
+    /// Registers the code-pages encoding provider exactly once so that non-Latin1
+    /// ISO/Windows code pages (e.g. ISO-8859-5 / code page 28595) are available to
+    /// <see cref="Encoding.GetEncoding(string)"/> and <see cref="Encoding.GetEncoding(int)"/>.
+    /// Static initialization is thread-safe and runs before any member is accessed.
+    /// </summary>
+    static NdcContentParser()
+    {
+        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
+    }
+
     /// <inheritdoc />
     public string FormatName => "ndc";
 
@@ -43,8 +54,8 @@ public IReadOnlyList<TemplateElement> Parse(ReadOnlyMemory<byte> data, ContentPa
             return [];
 
         var encodingName = "latin1";
-        if (options is not null && options.TryGetValue("input_encoding", out var enc) && enc is string encStr)
-            encodingName = encStr;
+        if (options is not null && options.TryGetValue("input_encoding", out var enc) && enc is not null)
+            encodingName = enc.ToString() ?? "latin1";
 
         var encoding = ResolveEncoding(encodingName);
         var textContent = encoding.GetString(data.Span);
@@ -52,21 +63,44 @@ public IReadOnlyList<TemplateElement> Parse(ReadOnlyMemory<byte> data, ContentPa
     }
 
     /// <summary>
-    /// Resolves a human-friendly encoding name to a <see cref="System.Text.Encoding"/> instance.
+    /// Resolves an encoding identifier to a <see cref="System.Text.Encoding"/> instance.
     /// </summary>
     /// <param name="name">
-    /// The encoding name. Supported values: <c>latin1</c>, <c>iso-8859-1</c>, <c>utf-8</c>,
-    /// <c>utf8</c>, <c>ascii</c>. Unrecognized values default to Latin-1.
+    /// The encoding identifier. All values are resolved through <see cref="Encoding.GetEncoding(string)"/>
+    /// (encoding names such as <c>latin1</c>, <c>iso-8859-1</c>, <c>iso-8859-5</c>, <c>utf-8</c>, <c>ascii</c>)
+    /// or <see cref="Encoding.GetEncoding(int)"/> (numeric code pages such as <c>28595</c>). Common names
+    /// already return the corresponding framework singletons. The dashless <c>utf8</c> alias is mapped to
+    /// <c>utf-8</c> for backward compatibility. Non-Latin1 ISO/Windows code pages are supported through the
+    /// registered code-pages provider.
     /// </param>
     /// <returns>The resolved <see cref="System.Text.Encoding"/>.</returns>
-    internal static Encoding ResolveEncoding(string name) =>
-        name.ToLowerInvariant() switch
+    /// <exception cref="ArgumentNullException">Thrown when <paramref name="name"/> is <see langword="null"/>.</exception>
+    /// <exception cref="NotSupportedException">
+    /// Thrown when <paramref name="name"/> does not correspond to a known encoding name or code page.
+    /// </exception>
+    internal static Encoding ResolveEncoding(string name)
+    {
+        ArgumentNullException.ThrowIfNull(name);
+
+        var trimmed = name.Trim();
+
+        // GetEncoding does not recognize the dashless "utf8" form; map it to the canonical "utf-8".
+        if (string.Equals(trimmed, "utf8", StringComparison.OrdinalIgnoreCase))
+            trimmed = "utf-8";
+
+        try
         {
-            "latin1" or "iso-8859-1" => Encoding.Latin1,
-            "utf-8" or "utf8" => Encoding.UTF8,
-            "ascii" => Encoding.ASCII,
-            _ => Encoding.Latin1
-        };
+            return int.TryParse(trimmed, System.Globalization.CultureInfo.InvariantCulture, out var codePage)
+                ? Encoding.GetEncoding(codePage)
+                : Encoding.GetEncoding(trimmed);
+        }
+        catch (Exception ex) when (ex is ArgumentException or NotSupportedException)
+        {
+            throw new NotSupportedException(
+                $"Unknown or unsupported input encoding: '{name}'. Use a known encoding name (e.g. 'iso-8859-5') or a numeric code page (e.g. '28595').",
+                ex);
+        }
+    }
 
     private static int CalculateMaxLineWidth(List<NdcToken> tokens, int tabWidth = 8)
     {
diff --git a/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs b/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs
index 0de61a8..c71d411 100644
--- a/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs
+++ b/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs
@@ -93,7 +93,8 @@ public void WithNdc_RegistersBothStringAndBinaryParser()
     [InlineData("utf-8")]
     [InlineData("utf8")]
     [InlineData("ascii")]
-    [InlineData("unknown")]
+    [InlineData("iso-8859-5")]
+    [InlineData("28595")]
     public void ResolveEncoding_ReturnsValidEncoding(string name)
     {
         var encoding = NdcContentParser.ResolveEncoding(name);
@@ -101,6 +102,16 @@ public void ResolveEncoding_ReturnsValidEncoding(string name)
         Assert.NotNull(encoding);
     }
 
+    [Theory]
+    [InlineData("iso-8859-5")]
+    [InlineData("28595")]
+    public void ResolveEncoding_Iso88595_ByNameOrCodePage_ResolvesToCodePage28595(string name)
+    {
+        var encoding = NdcContentParser.ResolveEncoding(name);
+
+        Assert.Equal(28595, encoding.CodePage);
+    }
+
     [Fact]
     public void ResolveEncoding_Latin1_ReturnsLatin1()
     {
@@ -122,8 +133,66 @@ public void ResolveEncoding_Ascii_ReturnsAscii()
     }
 
     [Fact]
-    public void ResolveEncoding_Unknown_DefaultsToLatin1()
+    public void ResolveEncoding_Unknown_Throws()
+    {
+        var ex = Assert.Throws<NotSupportedException>(
+            () => NdcContentParser.ResolveEncoding("something-else"));
+
+        Assert.Contains("something-else", ex.Message, StringComparison.Ordinal);
+    }
+
+    [Fact]
+    public void ResolveEncoding_UnknownCodePage_Throws()
+    {
+        var ex = Assert.Throws<NotSupportedException>(
+            () => NdcContentParser.ResolveEncoding("999999"));
+
+        Assert.Contains("999999", ex.Message, StringComparison.Ordinal);
+    }
+
+    [Fact]
+    public void ParseBytes_WithIso88595Encoding_DecodesCyrillic()
+    {
+        var parser = new NdcContentParser();
+        // Cyrillic text encoded with ISO-8859-5 (code page 28595). The default
+        // NDC charset uses "none" encoding, so the decoded Unicode survives unchanged.
+        // GetEncoding(28595) succeeds here because the parser's static constructor
+        // already registered the code-pages provider.
+        var text = "Привет";
+        var iso88595 = global::System.Text.Encoding.GetEncoding(28595);
+        var data = iso88595.GetBytes(text);
+        var options = new Dictionary<string, object>
+        {
+            ["input_encoding"] = "iso-8859-5"
+        };
+
+        var result = parser.Parse(data, EmptyContext, options);
+
+        var root = Assert.IsType<FlexElement>(Assert.Single(result));
+        var row = Assert.IsType<FlexElement>(root.Children[0]);
+        var textElement = Assert.IsType<TextElement>(row.Children[0]);
+        Assert.Equal("Привет", textElement.Content);
+    }
+
+    [Fact]
+    public void ParseBytes_WithNumericInputEncoding_DecodesCyrillic()
     {
-        Assert.Same(global::System.Text.Encoding.Latin1, NdcContentParser.ResolveEncoding("something-else"));
+        var parser = new NdcContentParser();
+        // input_encoding arrives as a boxed int (as YAML numeric scalars do), not a string.
+        // The binary path must coerce it to its string form so the numeric code page resolves.
+        var text = "Привет";
+        var iso88595 = global::System.Text.Encoding.GetEncoding(28595);
+        var data = iso88595.GetBytes(text);
+        var options = new Dictionary<string, object>
+        {
+            ["input_encoding"] = 28595
+        };
+
+        var result = parser.Parse(data, EmptyContext, options);
+
+        var root = Assert.IsType<FlexElement>(Assert.Single(result));
+        var row = Assert.IsType<FlexElement>(root.Children[0]);
+        var textElement = Assert.IsType<TextElement>(row.Children[0]);
+        Assert.Equal("Привет", textElement.Content);
     }
 }