Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 46 additions & 12 deletions src/FlexRender.Content.Ndc/NdcContentParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,17 @@ namespace FlexRender.Content.Ndc;
/// </summary>
public sealed class NdcContentParser : IContentParser, IBinaryContentParser
{
/// <summary>
/// Type initializer: installs <see cref="CodePagesEncodingProvider.Instance"/> via
/// <see cref="Encoding.RegisterProvider(EncodingProvider)"/> so that code pages beyond the
/// built-in set (for example ISO-8859-5 / code page 28595) can be looked up with
/// <see cref="Encoding.GetEncoding(string)"/> and <see cref="Encoding.GetEncoding(int)"/>.
/// The runtime executes a static constructor at most once, before the first instance is
/// created or any static member is referenced, so no extra locking is needed here.
/// </summary>
static NdcContentParser() => Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

/// <inheritdoc />
public string FormatName
{
    get { return "ndc"; }
}

Expand Down Expand Up @@ -43,30 +54,53 @@ public IReadOnlyList<TemplateElement> Parse(ReadOnlyMemory<byte> data, ContentPa
return [];

var encodingName = "latin1";
if (options is not null && options.TryGetValue("input_encoding", out var enc) && enc is string encStr)
encodingName = encStr;
if (options is not null && options.TryGetValue("input_encoding", out var enc) && enc is not null)
encodingName = enc.ToString() ?? "latin1";

var encoding = ResolveEncoding(encodingName);
var textContent = encoding.GetString(data.Span);
return Parse(textContent, context, options);
}

/// <summary>
/// Resolves an encoding identifier to a <see cref="System.Text.Encoding"/> instance.
/// </summary>
/// <param name="name">
/// The encoding identifier; surrounding whitespace is ignored. A value that parses as an integer
/// is treated as a numeric code page and resolved through <see cref="Encoding.GetEncoding(int)"/>
/// (for example <c>28595</c>). Any other value is resolved by name through
/// <see cref="Encoding.GetEncoding(string)"/> (for example <c>latin1</c>, <c>iso-8859-1</c>,
/// <c>iso-8859-5</c>, <c>utf-8</c>, <c>ascii</c>). The dashless <c>utf8</c> alias is mapped to
/// <c>utf-8</c> for backward compatibility. Code page <c>0</c> is rejected because it selects a
/// machine-dependent default encoding rather than a specific one.
/// </param>
/// <returns>The resolved <see cref="System.Text.Encoding"/>.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="name"/> is <see langword="null"/>.</exception>
/// <exception cref="NotSupportedException">
/// Thrown when <paramref name="name"/> does not correspond to a known encoding name or code page,
/// or when it designates code page <c>0</c>.
/// </exception>
internal static Encoding ResolveEncoding(string name)
{
    ArgumentNullException.ThrowIfNull(name);

    var identifier = name.Trim();

    // Map the dashless alias explicitly instead of relying on the framework's name table.
    if (string.Equals(identifier, "utf8", StringComparison.OrdinalIgnoreCase))
    {
        identifier = "utf-8";
    }

    var isCodePage = int.TryParse(identifier, System.Globalization.CultureInfo.InvariantCulture, out var codePage);

    // Code page 0 means "use the default encoding", which varies with the host; decoding
    // input bytes with it would make parse results machine-dependent, so refuse it up front.
    if (isCodePage && codePage == 0)
    {
        throw new NotSupportedException(DescribeUnsupported(name));
    }

    try
    {
        return isCodePage
            ? Encoding.GetEncoding(codePage)
            : Encoding.GetEncoding(identifier);
    }
    catch (Exception ex) when (ex is ArgumentException or NotSupportedException)
    {
        // Surface one exception type for both unknown names (ArgumentException) and
        // unknown/out-of-range code pages, keeping the framework exception as the cause.
        throw new NotSupportedException(DescribeUnsupported(name), ex);
    }

    // Builds the caller-facing message; it quotes the identifier exactly as supplied.
    static string DescribeUnsupported(string raw) =>
        $"Unknown or unsupported input encoding: '{raw}'. Use a known encoding name (e.g. 'iso-8859-5') or a numeric code page (e.g. '28595').";
}

private static int CalculateMaxLineWidth(List<NdcToken> tokens, int tabWidth = 8)
{
Expand Down
75 changes: 72 additions & 3 deletions tests/FlexRender.Tests/Content/Ndc/NdcBinaryParserTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -93,14 +93,25 @@ public void WithNdc_RegistersBothStringAndBinaryParser()
[InlineData("utf-8")]
[InlineData("utf8")]
[InlineData("ascii")]
[InlineData("iso-8859-5")]
[InlineData("28595")]
// Smoke test: every identifier listed above must resolve to a non-null encoding.
// Unknown identifiers are deliberately absent here because ResolveEncoding throws
// NotSupportedException for them; that behavior has its own dedicated tests.
public void ResolveEncoding_ReturnsValidEncoding(string name)
{
    var encoding = NdcContentParser.ResolveEncoding(name);

    Assert.NotNull(encoding);
}

[Theory]
[InlineData("iso-8859-5")]
[InlineData("28595")]
// The encoding name and its numeric code page must both resolve to code page 28595.
public void ResolveEncoding_Iso88595_ByNameOrCodePage_ResolvesToCodePage28595(string name)
{
    const int expectedCodePage = 28595;

    var actualCodePage = NdcContentParser.ResolveEncoding(name).CodePage;

    Assert.Equal(expectedCodePage, actualCodePage);
}

[Fact]
public void ResolveEncoding_Latin1_ReturnsLatin1()
{
Expand All @@ -122,8 +133,66 @@ public void ResolveEncoding_Ascii_ReturnsAscii()
}

[Fact]
// An unrecognized encoding name must be rejected with NotSupportedException,
// and the message must quote the offending identifier.
public void ResolveEncoding_Unknown_Throws()
{
    var ex = Assert.Throws<NotSupportedException>(
        () => NdcContentParser.ResolveEncoding("something-else"));

    Assert.Contains("something-else", ex.Message, StringComparison.Ordinal);
}

[Fact]
// A numeric identifier that is not a usable code page must be rejected with
// NotSupportedException, and the message must quote the value that was supplied.
public void ResolveEncoding_UnknownCodePage_Throws()
{
    const string bogusCodePage = "999999";

    void Resolve() => NdcContentParser.ResolveEncoding(bogusCodePage);

    var thrown = Assert.Throws<NotSupportedException>(Resolve);
    Assert.Contains(bogusCodePage, thrown.Message, StringComparison.Ordinal);
}

[Fact]
// Bytes encoded as ISO-8859-5 must decode back to the original Cyrillic text when
// "input_encoding" names that encoding.
public void ParseBytes_WithIso88595Encoding_DecodesCyrillic()
{
    // Create the parser first: its static constructor registers the code-pages provider,
    // which the GetEncoding(28595) call below depends on.
    var parser = new NdcContentParser();
    const string expected = "Привет";
    byte[] payload = global::System.Text.Encoding.GetEncoding(28595).GetBytes(expected);
    var options = new Dictionary<string, object>();
    options["input_encoding"] = "iso-8859-5";

    var elements = parser.Parse(payload, EmptyContext, options);

    // Expected shape: a single root flex element -> first row -> first text element.
    // NOTE(review): assumes the default NDC charset leaves the decoded text unchanged.
    var root = Assert.IsType<FlexElement>(Assert.Single(elements));
    var firstRow = Assert.IsType<FlexElement>(root.Children[0]);
    var firstText = Assert.IsType<TextElement>(firstRow.Children[0]);
    Assert.Equal(expected, firstText.Content);
}

[Fact]
// "input_encoding" supplied as a boxed int rather than a string must still select the
// matching code page, so the binary Parse overload has to coerce it to its string form.
public void ParseBytes_WithNumericInputEncoding_DecodesCyrillic()
{
    // Create the parser first: its static constructor registers the code-pages provider,
    // which the GetEncoding(28595) call below depends on.
    var parser = new NdcContentParser();
    var text = "Привет";
    var iso88595 = global::System.Text.Encoding.GetEncoding(28595);
    var data = iso88595.GetBytes(text);
    var options = new Dictionary<string, object>
    {
        // Deliberately an int, not "28595".
        ["input_encoding"] = 28595
    };

    var result = parser.Parse(data, EmptyContext, options);

    // Expected shape: a single root flex element -> first row -> first text element.
    var root = Assert.IsType<FlexElement>(Assert.Single(result));
    var row = Assert.IsType<FlexElement>(root.Children[0]);
    var textElement = Assert.IsType<TextElement>(row.Children[0]);
    Assert.Equal("Привет", textElement.Content);
}
}
Loading