diff --git a/src/FlexRender.Content.Ndc/NdcContentParser.cs b/src/FlexRender.Content.Ndc/NdcContentParser.cs
index 8c8cf54..8dd4510 100644
--- a/src/FlexRender.Content.Ndc/NdcContentParser.cs
+++ b/src/FlexRender.Content.Ndc/NdcContentParser.cs
@@ -13,6 +13,17 @@ namespace FlexRender.Content.Ndc;
///
public sealed class NdcContentParser : IContentParser, IBinaryContentParser
{
+ /// <summary>
+ /// Registers the code-pages encoding provider exactly once so that non-Latin1
+ /// ISO/Windows code pages (e.g. ISO-8859-5 / code page 28595) are available to
+ /// <see cref="Encoding.GetEncoding(string)"/> and <see cref="Encoding.GetEncoding(int)"/>.
+ /// Static initialization is thread-safe and runs before any member is accessed.
+ /// </summary>
+ static NdcContentParser()
+ {
+ Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
+ }
+
///
public string FormatName => "ndc";
@@ -43,8 +54,8 @@ public IReadOnlyList Parse(ReadOnlyMemory data, ContentPa
return [];
var encodingName = "latin1";
- if (options is not null && options.TryGetValue("input_encoding", out var enc) && enc is string encStr)
- encodingName = encStr;
+ if (options is not null && options.TryGetValue("input_encoding", out var enc) && enc is not null)
+ encodingName = enc.ToString() ?? "latin1";
var encoding = ResolveEncoding(encodingName);
var textContent = encoding.GetString(data.Span);
@@ -52,21 +63,44 @@ public IReadOnlyList Parse(ReadOnlyMemory data, ContentPa
}
/// <summary>
- /// Resolves a human-friendly encoding name to a instance.
+ /// Resolves an encoding identifier to an <see cref="Encoding"/> instance.
/// </summary>
/// <param name="name">
- /// The encoding name. Supported values: latin1, iso-8859-1, utf-8,
- /// utf8, ascii. Unrecognized values default to Latin-1.
+ /// The encoding identifier; surrounding whitespace is ignored. Names are resolved through
+ /// <see cref="Encoding.GetEncoding(string)"/> (such as <c>latin1</c>, <c>iso-8859-1</c>, <c>iso-8859-5</c>, <c>utf-8</c>, <c>ascii</c>)
+ /// and positive numbers through <see cref="Encoding.GetEncoding(int)"/> (code pages such as <c>28595</c>). Common names
+ /// already return the corresponding framework singletons. The dashless <c>utf8</c> alias is mapped to
+ /// <c>utf-8</c> for backward compatibility. Code page <c>0</c> (the machine-dependent default) is rejected.
+ /// Non-Latin1 ISO/Windows code pages are supported through the registered code-pages provider.
/// </param>
/// <returns>The resolved <see cref="Encoding"/>.</returns>
- internal static Encoding ResolveEncoding(string name) =>
- name.ToLowerInvariant() switch
+ /// <exception cref="ArgumentNullException">Thrown when <paramref name="name"/> is <see langword="null"/>.</exception>
+ /// <exception cref="NotSupportedException">
+ /// Thrown when <paramref name="name"/> does not correspond to a known encoding name or code page.
+ /// </exception>
+ internal static Encoding ResolveEncoding(string name)
+ {
+ ArgumentNullException.ThrowIfNull(name);
+
+ var trimmed = name.Trim();
+ // GetEncoding does not recognize the dashless "utf8" form; map it to the canonical "utf-8".
+ if (string.Equals(trimmed, "utf8", StringComparison.OrdinalIgnoreCase))
+ trimmed = "utf-8";
+
+ try
{
- "latin1" or "iso-8859-1" => Encoding.Latin1,
- "utf-8" or "utf8" => Encoding.UTF8,
- "ascii" => Encoding.ASCII,
- _ => Encoding.Latin1
- };
+ // Zero/negative numbers go to the name lookup, which rejects them; GetEncoding(0) would silently return the machine's default encoding.
+ return int.TryParse(trimmed, System.Globalization.CultureInfo.InvariantCulture, out var codePage) && codePage > 0
+ ? Encoding.GetEncoding(codePage)
+ : Encoding.GetEncoding(trimmed);
+ }
+ catch (Exception ex) when (ex is ArgumentException or NotSupportedException)
+ {
+ throw new NotSupportedException(
+ $"Unknown or unsupported input encoding: '{name}'. Use a known encoding name (e.g. 'iso-8859-5') or a numeric code page (e.g. '28595').",
+ ex);
+ }
+ }
private static int CalculateMaxLineWidth(List tokens, int tabWidth = 8)
{
diff --git a/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs b/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs
index 0de61a8..c71d411 100644
--- a/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs
+++ b/tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs
@@ -93,7 +93,8 @@ public void WithNdc_RegistersBothStringAndBinaryParser()
[InlineData("utf-8")]
[InlineData("utf8")]
[InlineData("ascii")]
- [InlineData("unknown")]
+ [InlineData("iso-8859-5")]
+ [InlineData("28595")]
public void ResolveEncoding_ReturnsValidEncoding(string name)
{
var encoding = NdcContentParser.ResolveEncoding(name);
@@ -101,6 +102,16 @@ public void ResolveEncoding_ReturnsValidEncoding(string name)
Assert.NotNull(encoding);
}
+ [Theory]
+ [InlineData("iso-8859-5")]
+ [InlineData("28595")]
+ public void ResolveEncoding_Iso88595_ByNameOrCodePage_ResolvesToCodePage28595(string name)
+ {
+ // The encoding name and the numeric identifier must both land on the same code page.
+ const int expectedCodePage = 28595;
+ Assert.Equal(expectedCodePage, NdcContentParser.ResolveEncoding(name).CodePage);
+ }
+
[Fact]
public void ResolveEncoding_Latin1_ReturnsLatin1()
{
@@ -122,8 +133,66 @@ public void ResolveEncoding_Ascii_ReturnsAscii()
}
[Fact]
- public void ResolveEncoding_Unknown_DefaultsToLatin1()
+ public void ResolveEncoding_Unknown_Throws()
+ {
+ var ex = Assert.Throws<NotSupportedException>(
+ () => NdcContentParser.ResolveEncoding("something-else"));
+ // The message must echo the rejected value so a misconfigured input_encoding is easy to spot.
+ Assert.Contains("something-else", ex.Message, StringComparison.Ordinal);
+ }
+
+ [Fact]
+ public void ResolveEncoding_UnknownCodePage_Throws()
+ {
+ var ex = Assert.Throws<NotSupportedException>(
+ () => NdcContentParser.ResolveEncoding("999999"));
+ // A numeric identifier that is not a usable code page must fail the same way as an unknown name.
+ Assert.Contains("999999", ex.Message, StringComparison.Ordinal);
+ }
+
+ [Fact]
+ public void ParseBytes_WithIso88595Encoding_DecodesCyrillic()
+ {
+ var parser = new NdcContentParser();
+ // Cyrillic text encoded as ISO-8859-5 (code page 28595), selected below by encoding name.
+ // GetEncoding(28595) is expected to succeed here because the parser's static constructor
+ // registers the code-pages provider, and the parser has already been constructed above.
+ // NOTE(review): assumes the default NDC charset passes decoded text through unchanged; confirm.
+ var text = "Привет";
+ var iso88595 = global::System.Text.Encoding.GetEncoding(28595);
+ var data = iso88595.GetBytes(text);
+ var options = new Dictionary
+ {
+ ["input_encoding"] = "iso-8859-5"
+ };
+
+ var result = parser.Parse(data, EmptyContext, options);
+
+ var root = Assert.IsType(Assert.Single(result));
+ var row = Assert.IsType(root.Children[0]);
+ var textElement = Assert.IsType(row.Children[0]);
+ Assert.Equal("Привет", textElement.Content);
+ }
+
+ [Fact]
+ public void ParseBytes_WithNumericInputEncoding_DecodesCyrillic()
{
- Assert.Same(global::System.Text.Encoding.Latin1, NdcContentParser.ResolveEncoding("something-else"));
+ var parser = new NdcContentParser();
+ // input_encoding is supplied as a boxed int rather than a string (presumably how YAML numeric
+ // scalars arrive; confirm against the loader). Parse must stringify it so code page 28595 resolves.
+ var text = "Привет";
+ var iso88595 = global::System.Text.Encoding.GetEncoding(28595);
+ var data = iso88595.GetBytes(text);
+ var options = new Dictionary
+ {
+ ["input_encoding"] = 28595
+ };
+
+ var result = parser.Parse(data, EmptyContext, options);
+
+ var root = Assert.IsType(Assert.Single(result));
+ var row = Assert.IsType(root.Children[0]);
+ var textElement = Assert.IsType(row.Children[0]);
+ Assert.Equal("Привет", textElement.Content);
}
}