From 0f06b1322f1edd81a3b103754b7ff9bf2f860091 Mon Sep 17 00:00:00 2001 From: Diogo Sousa Date: Mon, 25 May 2026 15:16:41 +0100 Subject: [PATCH] Add bail-out handler API for flushing buffered state on graceful bail-out. Graceful bail-out (`MemoryLimitExceeded` or `ContentHandlerError` with the matching flag on) flushes the unparsed input remainder raw to the sink and propagates the error. That is enough for handlers that only transform tokens they see, but handlers that buffer state across the document (e.g. ROFL's email-obfuscation module, which holds up to ~128 chars in a text buffer while deciding whether they belong to an email) lose that state on bail-out and produce a response with a gap. This commit adds a hook that fires once on a graceful bail-out, immediately before the raw flush, and lets handlers append final bytes to the sink: 1. New rewritable unit `BailOut` with a single method `append(content, content_type)`, modelled after `DocumentEnd::append`. The wrapper carries the rewriter's current encoding (after any `<meta charset>`-driven change), so encoding-correctness is automatic. 2. New builder method `Settings::append_bail_out_handler` (and the `RewriteStrSettings` mirror) plus `bail_out!` macro for type-hint ergonomics, parallel to the existing `element!` / `end!` macros. 3. `HandlerTypes` grows a `BailOutHandler<'h>` associated type, with `LocalHandlerTypes` aliasing `BailOutHandler<'h>` and `SendHandlerTypes` aliasing `BailOutHandlerSend<'h>`. Matching `IntoHandler` impls cover both bare-closure cases. 4. `TransformController` grows a `handle_bail_out` method with an empty default impl so existing implementors (test fixtures, parser-trace tool) keep compiling. `HtmlRewriteController` overrides it to iterate the user-registered handlers in registration order. 5. `Dispatcher::run_bail_out_handlers` constructs the `BailOut` wrapper and delegates to the controller. 
It is invoked from every existing graceful bail-out site in `TransformStream::write()` (3 sites: `Arena::append`, `Parser::parse`, `Arena::init_with`) and `TransformStream::end()` (1 site), gated on `should_bail_out_for(&err)`. Hook output therefore lands in the sink as `[transformed prefix] + [hook output] + [raw remainder]`. 6. `RewritingError` is marked `#[non_exhaustive]` so we can add variants in future minor releases. `match`es still work; only exhaustive external matches need a catch-all arm. The `end()` bail-out site is defensive: it is symmetric with the `write()` sites but is not reachable through normal input. EOF-in-tag / -attribute / -comment emits as text per HTML5, so content-handler errors don't fire from `parser.parse(_, true)`, and memory errors fire earlier in `write()`. Tested implicitly via the shared call path. --- CHANGELOG.md | 11 ++ src/lib.rs | 17 ++- src/rewritable_units/bail_out.rs | 72 +++++++++ src/rewritable_units/mod.rs | 2 + src/rewriter/mod.rs | 235 ++++++++++++++++++++++++++++- src/rewriter/rewrite_controller.rs | 13 +- src/rewriter/settings.rs | 143 +++++++++++++++++- src/transform_stream/dispatcher.rs | 20 ++- src/transform_stream/mod.rs | 25 ++- 9 files changed, 517 insertions(+), 21 deletions(-) create mode 100644 src/rewritable_units/bail_out.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index e7496e23..07a00d83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,17 @@ rewriter flushes remaining input bytes before propagating a handler error, preserving the response. Currently exposed via the Rust API only; the C API still uses the original behavior. +- Added `Settings::append_bail_out_handler()` and the matching `bail_out!` macro, + `BailOut` rewritable unit, and `BailOutHandler` / `BailOutHandlerSend` type aliases. + Bail-out handlers fire immediately before the raw flush of remaining unparsed input on a + graceful bail-out (memory or content-handler error). 
Handlers receive the + `RewritingError` and a `BailOut` through which they can append final bytes to the sink + via `BailOut::append(content, content_type)`. Intended for handlers that buffer state + across the document (e.g. text-buffering handlers that defer emission) and need to + flush that state on bail-out. +- Marked `RewritingError` `#[non_exhaustive]` so future error variants can be added without + a major version bump. External callers can still `match` on it, but must include a + catch-all `_ =>` arm. - Reworked `Settings`, `MemorySettings` and `RewriteStrSettings` to use a consuming-builder API. Fields are now private; construction is via `::new()` plus chained `with_*` setters and `append_*` methods for the content-handler vectors. This makes future field additions diff --git a/src/lib.rs b/src/lib.rs index e2ff3293..f7190761 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -41,10 +41,10 @@ mod transform_stream; use cfg_if::cfg_if; pub use self::rewriter::{ - AsciiCompatibleEncoding, CommentHandler, DoctypeHandler, DocumentContentHandlers, - ElementContentHandlers, ElementHandler, EndHandler, EndTagHandler, HandlerResult, HandlerTypes, - HtmlRewriter, LocalHandlerTypes, MemorySettings, RewriteStrSettings, Settings, TextHandler, - rewrite_str, + AsciiCompatibleEncoding, BailOutHandler, CommentHandler, DoctypeHandler, + DocumentContentHandlers, ElementContentHandlers, ElementHandler, EndHandler, EndTagHandler, + HandlerResult, HandlerTypes, HtmlRewriter, LocalHandlerTypes, MemorySettings, + RewriteStrSettings, Settings, TextHandler, rewrite_str, }; pub use self::selectors_vm::Selector; pub use self::transform_stream::OutputSink; @@ -56,9 +56,10 @@ pub use self::transform_stream::OutputSink; /// Rewriting is sequential, so there's no benefit from using the `Send`-compatible rewriter. 
pub mod send { pub use crate::rewriter::{ - CommentHandlerSend as CommentHandler, DoctypeHandlerSend as DoctypeHandler, - ElementHandlerSend as ElementHandler, EndHandlerSend as EndHandler, - EndTagHandlerSend as EndTagHandler, TextHandlerSend as TextHandler, + BailOutHandlerSend as BailOutHandler, CommentHandlerSend as CommentHandler, + DoctypeHandlerSend as DoctypeHandler, ElementHandlerSend as ElementHandler, + EndHandlerSend as EndHandler, EndTagHandlerSend as EndTagHandler, + TextHandlerSend as TextHandler, }; pub use crate::rewriter::{IntoHandler, SendHandlerTypes}; @@ -95,7 +96,7 @@ pub mod errors { /// HTML content descriptors that can be produced and modified by a rewriter. pub mod html_content { pub use super::rewritable_units::{ - Attribute, Comment, ContentType, Doctype, DocumentEnd, Element, EndTag, StartTag, + Attribute, BailOut, Comment, ContentType, Doctype, DocumentEnd, Element, EndTag, StartTag, StreamingHandler, StreamingHandlerSink, TextChunk, UserData, }; diff --git a/src/rewritable_units/bail_out.rs b/src/rewritable_units/bail_out.rs new file mode 100644 index 00000000..c8e07b03 --- /dev/null +++ b/src/rewritable_units/bail_out.rs @@ -0,0 +1,72 @@ +use super::{ContentType, StreamingHandlerSink}; +use crate::transform_stream::OutputSink; +use encoding_rs::Encoding; + +/// A rewritable unit that represents the moment the rewriter is about to abandon +/// processing through a graceful bail-out. +/// +/// Bail-out handlers registered via [`Settings::append_bail_out_handler()`] receive a +/// `&mut BailOut` and can emit final bytes into the output sink via [`append()`]. This +/// is the only opportunity for content other handlers have buffered (e.g. text withheld +/// pending a future chunk) to land in the response when the rewriter aborts. +/// +/// Bytes appended via this unit are written *before* the rewriter's own raw flush of +/// remaining unparsed input. The resulting sink order is: +/// +/// 1. 
Transformed bytes the rewriter already emitted normally. +/// 2. Bytes appended by bail-out handlers, in registration order. +/// 3. The rewriter's raw flush of the chunk's unparsed suffix. +/// +/// [`Settings::append_bail_out_handler()`]: +/// crate::Settings::append_bail_out_handler +/// [`append()`]: Self::append +pub struct BailOut<'a> { + output_sink: &'a mut dyn OutputSink, + encoding: &'static Encoding, +} + +impl<'a> BailOut<'a> { + #[inline] + #[must_use] + pub(crate) fn new(output_sink: &'a mut dyn OutputSink, encoding: &'static Encoding) -> Self { + Self { + output_sink, + encoding, + } + } + + /// Appends `content` at the bail-out point. + /// + /// Subsequent calls to this method append `content` to the previously inserted + /// content within the same bail-out invocation. When multiple bail-out handlers are + /// registered, their `append` calls are concatenated in registration order. + /// + /// `content_type` controls how the content is interpreted before being written to + /// the sink. See [`ContentType`]. + /// + /// # Example + /// + /// ``` + /// use lol_html::{bail_out, Settings}; + /// use lol_html::errors::RewritingError; + /// use lol_html::html_content::ContentType; + /// + /// // A handler that, on content-handler-error bail-out, drops a notice into the sink + /// // before the rewriter's own raw flush of remaining unparsed input. 
+ /// let settings = Settings::new() + /// .with_graceful_bail_out_on_content_handler_error(true) + /// .append_bail_out_handler(bail_out!(|err, bail_out| { + /// if matches!(err, RewritingError::ContentHandlerError(_)) { + /// bail_out.append("", ContentType::Html); + /// } + /// })); + /// # let _ = settings; + /// ``` + #[inline] + pub fn append(&mut self, content: &str, content_type: ContentType) { + StreamingHandlerSink::new(self.encoding, &mut |c| { + self.output_sink.handle_chunk(c); + }) + .write_str(content, content_type); + } +} diff --git a/src/rewritable_units/mod.rs b/src/rewritable_units/mod.rs index c26dafb3..8c22fa35 100644 --- a/src/rewritable_units/mod.rs +++ b/src/rewritable_units/mod.rs @@ -4,6 +4,7 @@ pub(crate) use self::mutations::{Mutations, StringChunk}; pub(crate) use self::text_decoder::TextDecoder; pub(crate) use self::text_encoder::{IncompleteUtf8Resync, TextEncoder}; +pub use self::bail_out::*; pub use self::document_end::*; pub use self::element::*; pub use self::mutations::{ContentType, StreamingHandler}; @@ -83,6 +84,7 @@ macro_rules! impl_user_data { #[macro_use] mod mutations; +mod bail_out; mod document_end; mod element; mod streaming_sink; diff --git a/src/rewriter/mod.rs b/src/rewriter/mod.rs index a002a9d8..dbdf7476 100644 --- a/src/rewriter/mod.rs +++ b/src/rewriter/mod.rs @@ -69,8 +69,12 @@ impl TryFrom<&'static Encoding> for AsciiCompatibleEncoding { /// This error is unrecoverable. The rewriter instance will panic on attempt to use it after such an /// error. /// +/// This enum is marked `#[non_exhaustive]` so that future variants can be added in minor +/// releases. External `match` expressions on `RewritingError` must include a wildcard arm. +/// /// [`write`]: ../struct.HtmlRewriter.html#method.write /// [`end`]: ../struct.HtmlRewriter.html#method.end +#[non_exhaustive] #[derive(Error, Debug)] pub enum RewritingError { /// See [`MemoryLimitExceededError`]. 
@@ -922,9 +926,11 @@ mod tests { mod fatal_errors { use super::*; - use crate::html_content::Comment; + use crate::html_content::{Comment, ContentType}; use crate::memory::MemoryLimitExceededError; use crate::rewritable_units::{Element, TextChunk}; + use std::cell::Cell; + use std::rc::Rc; fn create_rewriter( max_allowed_memory_usage: usize, @@ -1558,6 +1564,233 @@ mod tests { ); } + // --- Bail-out handler tests --- + // + // The bail-out handler is invoked immediately before the raw flush of remaining + // unparsed input. Handlers can append final bytes to the sink via + // [`BailOut::append`] (`text_buffer`-style flushes in ROFL). + // + // The end()-path bail-out site is symmetric with the write() sites but is not + // reachable through normal input: memory errors fire during write()'s parse, and + // EOF-in-tag/attribute emits as text per HTML5 (so handlers don't fire from + // `parse(_, true)`). Tested implicitly by sharing the same code path with the + // write() sites. + + /// Verifies the hook runs and its output lands in the sink ahead of the raw flush, + /// so callers see `[transformed prefix] + [hook output] + [raw remainder]`. + #[test] + fn test_bail_out_handler_emits_before_raw_flush() { + const MAX: usize = 100; + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings::new() + .with_memory_settings( + MemorySettings::new() + .with_max_allowed_memory_usage(MAX) + .with_preallocated_parsing_buffer_size(0) + .with_graceful_bail_out_on_memory_limit_exceeded(true), + ) + .append_document_content_handler(doc_comments!(|c| { + c.set_text("TRANSFORMED").unwrap(); + Ok(()) + })) + .append_bail_out_handler(bail_out!(|_err, bail_out| { + bail_out.append("HOOK", ContentType::Text); + })), + |c: &[u8]| output.extend_from_slice(c), + ); + + // chunk_1: a comment the handler transforms, plus an unfinished tag that gets + // buffered. 
chunk_2: trying to append this to the buffer exceeds the limit, so + // the Arena::append bail-out site fires. + let chunk_1 = format!("\"{}",", "r".repeat(50)); + + rewriter.write(chunk_1.as_bytes()).unwrap(); + let err = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + assert!(matches!(err, RewritingError::MemoryLimitExceeded(_))); + + let output_str = std::str::from_utf8(&output).unwrap(); + let transformed_idx = output_str + .find("") + .expect("transformed comment must be present"); + let hook_idx = output_str + .find("HOOK") + .expect("hook output must be present"); + let raw_idx = output_str + .find("\"")firstmiddle"; + let mut output = Vec::::new(); + let hook_called = Rc::new(Cell::new(false)); + let hook_called_clone = Rc::clone(&hook_called); + + let mut rewriter = HtmlRewriter::new( + Settings::new() + .with_graceful_bail_out_on_content_handler_error(true) + .append_element_content_handler(element!("stop", |_| Err( + "handler refused".into() + ))) + .append_bail_out_handler(bail_out!(move |err, bail_out| { + assert!( + matches!(err, RewritingError::ContentHandlerError(_)), + "expected ContentHandlerError in hook, got {err}", + ); + hook_called_clone.set(true); + bail_out.append("HOOK", ContentType::Text); + })), + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter.write(html).unwrap_err(); + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + assert!(hook_called.get(), "bail-out hook must have been called"); + + let output_str = std::str::from_utf8(&output).unwrap(); + assert!( + output_str.contains("HOOK"), + "hook output must appear in sink, got {output_str:?}", + ); + } + + /// Multiple bail-out handlers fire in registration order. The sink receives their + /// appended bytes in the same order. 
+ #[test] + fn test_multiple_bail_out_handlers_fire_in_order() { + const MAX: usize = 100; + + let mut output = Vec::::new(); + let call_order = Rc::new(Cell::new(String::new())); + let order_a = Rc::clone(&call_order); + let order_b = Rc::clone(&call_order); + let order_c = Rc::clone(&call_order); + + let mut rewriter = HtmlRewriter::new( + Settings::new() + .with_memory_settings( + MemorySettings::new() + .with_max_allowed_memory_usage(MAX) + .with_preallocated_parsing_buffer_size(0) + .with_graceful_bail_out_on_memory_limit_exceeded(true), + ) + // Element handler forces lex mode (default tag-scanner mode would + // consume unterminated attributes as text without buffering). + .append_element_content_handler(element!("*", |_| Ok(()))) + .append_bail_out_handler(bail_out!(move |_err, b| { + let mut s = order_a.take(); + s.push('A'); + order_a.set(s); + b.append("A", ContentType::Text); + })) + .append_bail_out_handler(bail_out!(move |_err, b| { + let mut s = order_b.take(); + s.push('B'); + order_b.set(s); + b.append("B", ContentType::Text); + })) + .append_bail_out_handler(bail_out!(move |_err, b| { + let mut s = order_c.take(); + s.push('C'); + order_c.set(s); + b.append("C", ContentType::Text); + })), + |c: &[u8]| output.extend_from_slice(c), + ); + + let chunk_1 = format!("\"{}",", "r".repeat(MAX / 2)); + rewriter.write(chunk_1.as_bytes()).unwrap(); + let _ = rewriter.write(chunk_2.as_bytes()).unwrap_err(); + + assert_eq!( + call_order.take(), + "ABC", + "handlers must fire in registration order" + ); + + let output_str = std::str::from_utf8(&output).unwrap(); + let a_idx = output_str.find('A').expect("A in sink"); + let b_idx = output_str.find('B').expect("B in sink"); + let c_idx = output_str.find('C').expect("C in sink"); + + assert!( + a_idx < b_idx && b_idx < c_idx, + "appended bytes must appear in registration order, got {output_str:?}", + ); + } + + /// On normal completion (no error), the bail-out hook is never invoked. 
+ #[test] + fn test_bail_out_handler_not_invoked_on_normal_completion() { + let hook_called = Rc::new(Cell::new(false)); + let hook_called_clone = Rc::clone(&hook_called); + + let mut output = Vec::::new(); + let mut rewriter = HtmlRewriter::new( + Settings::new().append_bail_out_handler(bail_out!(move |_err, _b| { + hook_called_clone.set(true); + })), + |c: &[u8]| output.extend_from_slice(c), + ); + + rewriter.write(b"

hello

").unwrap(); + rewriter.end().unwrap(); + + assert!( + !hook_called.get(), + "bail-out hook must not fire on normal completion", + ); + } + + /// When the graceful flag is off, an error still propagates but the bail-out hook + /// is not invoked. The hook is gated by `should_bail_out_for`, just like the raw + /// flush is. + #[test] + fn test_bail_out_handler_not_invoked_when_graceful_flag_disabled() { + let hook_called = Rc::new(Cell::new(false)); + let hook_called_clone = Rc::clone(&hook_called); + + let mut output = Vec::::new(); + // No `with_graceful_bail_out_on_content_handler_error(true)` — flag stays off. + let mut rewriter = HtmlRewriter::new( + Settings::new() + .append_element_content_handler(element!("stop", |_| Err( + "handler refused".into() + ))) + .append_bail_out_handler(bail_out!(move |_err, _b| { + hook_called_clone.set(true); + })), + |c: &[u8]| output.extend_from_slice(c), + ); + + let err = rewriter + .write(b"firstmiddle") + .unwrap_err(); + + assert!(matches!(err, RewritingError::ContentHandlerError(_))); + assert!( + !hook_called.get(), + "bail-out hook must not fire when graceful flag is off", + ); + } + #[test] fn content_handler_error_propagation() { fn assert_err<'h>( diff --git a/src/rewriter/rewrite_controller.rs b/src/rewriter/rewrite_controller.rs index b5cf9351..00095e0a 100644 --- a/src/rewriter/rewrite_controller.rs +++ b/src/rewriter/rewrite_controller.rs @@ -4,7 +4,7 @@ use crate::base::SharedEncoding; use crate::html::{LocalName, Namespace}; use crate::memory::SharedMemoryLimiter; use crate::parser::ActionError; -use crate::rewritable_units::{DocumentEnd, Token, TokenCaptureFlags}; +use crate::rewritable_units::{BailOut, DocumentEnd, Token, TokenCaptureFlags}; use crate::selectors_vm::{ Ast, AuxStartTagInfoRequest, DenseHashSet, ElementData, SelectorMatchingVm, VmError, }; @@ -35,6 +35,7 @@ impl ElementData for ElementDescriptor { pub(crate) struct HtmlRewriteController<'h, H: HandlerTypes> { handlers_dispatcher: 
ContentHandlersDispatcher<'h, H>, selector_matching_vm: Option>, + bail_out_handlers: Vec>, } impl<'h, H: HandlerTypes> HtmlRewriteController<'h, H> { @@ -83,17 +84,19 @@ impl<'h, H: HandlerTypes> HtmlRewriteController<'h, H> { None }; - Self::new(dispatcher, selector_matching_vm) + Self::new(dispatcher, selector_matching_vm, settings.bail_out_handlers) } #[inline] pub(crate) const fn new( handlers_dispatcher: ContentHandlersDispatcher<'h, H>, selector_matching_vm: Option>, + bail_out_handlers: Vec>, ) -> Self { HtmlRewriteController { handlers_dispatcher, selector_matching_vm, + bail_out_handlers, } } } @@ -188,4 +191,10 @@ impl TransformController for HtmlRewriteController<'_, H> { .handlers_dispatcher .has_matched_elements_with_removed_content() } + + fn handle_bail_out(&mut self, error: &RewritingError, bail_out: &mut BailOut<'_>) { + for handler in &mut self.bail_out_handlers { + handler(error, bail_out); + } + } } diff --git a/src/rewriter/settings.rs b/src/rewriter/settings.rs index eb64da4e..63ff8d95 100644 --- a/src/rewriter/settings.rs +++ b/src/rewriter/settings.rs @@ -1,7 +1,7 @@ -use crate::rewritable_units::{Comment, Doctype, DocumentEnd, Element, EndTag, TextChunk}; +use crate::rewritable_units::{BailOut, Comment, Doctype, DocumentEnd, Element, EndTag, TextChunk}; use crate::selectors_vm::Selector; // N.B. `use crate::` will break this because the constructor is not public, only the struct itself -use super::AsciiCompatibleEncoding; +use super::{AsciiCompatibleEncoding, RewritingError}; use std::borrow::Cow; use std::error::Error; @@ -35,6 +35,10 @@ pub trait HandlerTypes: Sized { type EndTagHandler<'handler>: FnOnce(&mut EndTag<'_>) -> HandlerResult + 'handler; /// Handler type for [`DocumentEnd`]. type EndHandler<'handler>: FnOnce(&mut DocumentEnd<'_>) -> HandlerResult + 'handler; + /// Handler type for [`BailOut`]: invoked when the rewriter triggers a graceful bail-out. + /// + /// See [`Settings::append_bail_out_handler()`] for details. 
+ type BailOutHandler<'handler>: FnMut(&RewritingError, &mut BailOut<'_>) + 'handler; // Inside the HTML rewriter we need to create handlers, and they need to be the most constrained // possible version of a handler (i.e. if we have `Send` and non-`Send` handlers we need to @@ -71,6 +75,7 @@ impl HandlerTypes for LocalHandlerTypes { type ElementHandler<'h> = ElementHandler<'h>; type EndTagHandler<'h> = EndTagHandler<'h>; type EndHandler<'h> = EndHandler<'h>; + type BailOutHandler<'h> = BailOutHandler<'h>; fn new_end_tag_handler<'h>( handler: impl IntoHandler>, @@ -106,6 +111,7 @@ impl HandlerTypes for SendHandlerTypes { type ElementHandler<'h> = ElementHandlerSend<'h, Self>; type EndTagHandler<'h> = EndTagHandlerSend<'h>; type EndHandler<'h> = EndHandlerSend<'h>; + type BailOutHandler<'h> = BailOutHandlerSend<'h>; fn new_end_tag_handler<'h>( handler: impl IntoHandler>, @@ -148,6 +154,11 @@ pub type ElementHandler<'h, H = LocalHandlerTypes> = pub type EndTagHandler<'h> = Box) -> HandlerResult + 'h>; /// Boxed closure for handling the document end. This is called after the last chunk is processed. pub type EndHandler<'h> = Box) -> HandlerResult + 'h>; +/// Boxed closure for handling a graceful bail-out. Called once if the rewriter triggers a +/// bail-out before propagating the [`RewritingError`]. +/// +/// See [`Settings::append_bail_out_handler()`]. +pub type BailOutHandler<'h> = Box) + 'h>; /// [Sendable](crate::send) boxed closure for handling the [document type declaration]. /// @@ -174,6 +185,10 @@ pub type EndTagHandlerSend<'h> = Box) -> HandlerResul /// /// See also non-sendable [`EndHandler`](crate::EndHandler). pub type EndHandlerSend<'h> = Box) -> HandlerResult + Send + 'h>; +/// [Sendable](crate::send) boxed closure for handling a graceful bail-out. +/// +/// See also non-sendable [`BailOutHandler`](crate::BailOutHandler). 
+pub type BailOutHandlerSend<'h> = Box) + Send + 'h>; /// Trait that allows closures to be used as handlers #[diagnostic::on_unimplemented( @@ -271,6 +286,20 @@ impl<'h, F: FnOnce(&mut DocumentEnd<'_>) -> HandlerResult + Send + 'h> } } +impl<'h, F: FnMut(&RewritingError, &mut BailOut<'_>) + 'h> IntoHandler> for F { + fn into_handler(self) -> BailOutHandler<'h> { + Box::new(self) + } +} + +impl<'h, F: FnMut(&RewritingError, &mut BailOut<'_>) + Send + 'h> + IntoHandler> for F +{ + fn into_handler(self) -> BailOutHandlerSend<'h> { + Box::new(self) + } +} + /// Specifies element content handlers associated with a selector. pub struct ElementContentHandlers<'h, H: HandlerTypes = LocalHandlerTypes> { /// Element handler. See [`element!`](crate::element) and [`HandlerTypes::ElementHandler`]. @@ -746,6 +775,49 @@ macro_rules! end { }}; } +/// A convenience macro to construct a [bail-out handler](Settings::append_bail_out_handler) for +/// the graceful bail-out path. +/// +/// The handler receives a [`&RewritingError`](crate::errors::RewritingError) and a +/// `&mut `[`BailOut`](crate::html_content::BailOut) through which it can append final bytes +/// to the sink before the rewriter's own raw flush. +/// +/// # Example +/// ``` +/// use lol_html::{bail_out, rewrite_str, RewriteStrSettings}; +/// use lol_html::errors::RewritingError; +/// use lol_html::html_content::ContentType; +/// +/// let result = rewrite_str( +/// r#"foo"#, +/// RewriteStrSettings::new() +/// .append_bail_out_handler(bail_out!(|err, bail_out| { +/// if matches!(err, RewritingError::ContentHandlerError(_)) { +/// bail_out.append("", ContentType::Html); +/// } +/// })), +/// ) +/// .unwrap(); +/// +/// // No bail-out happened, so the handler never fired. +/// assert_eq!(result, "foo"); +/// ``` +#[macro_export(local_inner_macros)] +macro_rules! bail_out { + ($handler:expr) => {{ + // Without this rust won't be able to always infer the type of the handler. 
+ #[inline(always)] + const fn type_hint(h: T) -> T + where + T: FnMut(&$crate::errors::RewritingError, &mut $crate::html_content::BailOut<'_>), + { + h + } + + type_hint($handler) + }}; +} + /// Specifies the memory settings for [`HtmlRewriter`]. /// /// Construct with [`MemorySettings::new()`] (or [`MemorySettings::default()`]) and configure the @@ -901,6 +973,7 @@ pub struct Settings<'handlers, 'selectors, H: HandlerTypes = LocalHandlerTypes> ElementContentHandlers<'handlers, H>, )>, pub(crate) document_content_handlers: Vec>, + pub(crate) bail_out_handlers: Vec>, pub(crate) encoding: AsciiCompatibleEncoding, pub(crate) memory_settings: MemorySettings, pub(crate) strict: bool, @@ -942,6 +1015,7 @@ impl<'handlers, 'selectors, H: HandlerTypes> Settings<'handlers, 'selectors, H> Settings { element_content_handlers: vec![], document_content_handlers: vec![], + bail_out_handlers: vec![], encoding: AsciiCompatibleEncoding(encoding_rs::UTF_8), memory_settings: MemorySettings::new(), strict: true, @@ -1014,6 +1088,55 @@ impl<'handlers, 'selectors, H: HandlerTypes> Settings<'handlers, 'selectors, H> self } + /// Appends a handler to be invoked when the rewriter triggers a graceful bail-out. + /// + /// Bail-out handlers fire when the rewriter is about to abort processing and propagate a + /// [`RewritingError`] through a graceful bail-out (i.e. when one of the + /// `graceful_bail_out_on_*` settings is enabled and the corresponding error fires). Each + /// handler receives the error and a [`BailOut`] through which it can append final bytes to + /// the sink via [`BailOut::append()`]. + /// + /// Handlers fire in registration order, *before* the rewriter's own raw flush of remaining + /// unparsed input. The resulting sink order is: + /// + /// 1. Transformed bytes the rewriter already emitted normally. + /// 2. Bytes appended by bail-out handlers, in registration order. + /// 3. The rewriter's raw flush of the chunk's unparsed suffix. 
+ /// + /// Handlers do not return errors. Any cleanup they cannot complete must be silently + /// abandoned. + /// + /// ### Hint + /// + /// The [`bail_out!`] convenience macro returns a value of the expected type, so it can be + /// passed directly: + /// + /// ``` + /// use lol_html::{bail_out, Settings}; + /// use lol_html::errors::RewritingError; + /// use lol_html::html_content::ContentType; + /// + /// let settings = Settings::new() + /// .with_graceful_bail_out_on_content_handler_error(true) + /// .append_bail_out_handler(bail_out!(|err, bail_out| { + /// if matches!(err, RewritingError::ContentHandlerError(_)) { + /// bail_out.append("", ContentType::Html); + /// } + /// })); + /// # let _ = settings; + /// ``` + /// + /// [`bail_out!`]: macro.bail_out.html + #[inline] + #[must_use] + pub fn append_bail_out_handler( + mut self, + handler: impl IntoHandler>, + ) -> Self { + self.bail_out_handlers.push(handler.into_handler()); + self + } + /// Sets the [character encoding] for the input and the output of the rewriter. 
/// /// Can be a [label] for any of the web-compatible encodings with an exception for `UTF-16LE`, @@ -1187,6 +1310,7 @@ impl<'h, 's, H: HandlerTypes> From> for Settings<' Settings { element_content_handlers: settings.element_content_handlers, document_content_handlers: settings.document_content_handlers, + bail_out_handlers: settings.bail_out_handlers, strict: settings.strict, enable_esi_tags: settings.enable_esi_tags, ..Settings::new_for_handler_types() @@ -1223,6 +1347,7 @@ pub struct RewriteStrSettings<'handlers, 'selectors, H: HandlerTypes = LocalHand ElementContentHandlers<'handlers, H>, )>, pub(crate) document_content_handlers: Vec>, + pub(crate) bail_out_handlers: Vec>, pub(crate) strict: bool, pub(crate) enable_esi_tags: bool, } @@ -1260,6 +1385,7 @@ impl<'handlers, 'selectors, H: HandlerTypes> RewriteStrSettings<'handlers, 'sele RewriteStrSettings { element_content_handlers: vec![], document_content_handlers: vec![], + bail_out_handlers: vec![], strict: true, enable_esi_tags: true, } @@ -1326,6 +1452,19 @@ impl<'handlers, 'selectors, H: HandlerTypes> RewriteStrSettings<'handlers, 'sele self } + /// Appends a handler to be invoked when the rewriter triggers a graceful bail-out. + /// + /// See [`Settings::append_bail_out_handler()`] for full semantics. Same shape. + #[inline] + #[must_use] + pub fn append_bail_out_handler( + mut self, + handler: impl IntoHandler>, + ) -> Self { + self.bail_out_handlers.push(handler.into_handler()); + self + } + /// If set to `true` the rewriter bails out if it encounters markup that drives the HTML parser /// into ambiguous state. 
/// diff --git a/src/transform_stream/dispatcher.rs b/src/transform_stream/dispatcher.rs index ca021631..c8b11ddd 100644 --- a/src/transform_stream/dispatcher.rs +++ b/src/transform_stream/dispatcher.rs @@ -9,7 +9,7 @@ use crate::parser::{ }; use crate::rewritable_units::TextDecoder; use crate::rewritable_units::ToTokenResult; -use crate::rewritable_units::{DocumentEnd, Serialize, ToToken, Token, TokenCaptureFlags}; +use crate::rewritable_units::{BailOut, DocumentEnd, Serialize, ToToken, Token, TokenCaptureFlags}; use crate::rewriter::RewritingError; use encoding_rs::Encoding; @@ -44,6 +44,11 @@ pub trait TransformController: Sized { fn handle_token(&mut self, token: &mut Token<'_>) -> Result<(), RewritingError>; fn handle_end(&mut self, document_end: &mut DocumentEnd<'_>) -> Result<(), RewritingError>; fn should_emit_content(&self) -> bool; + + /// Invoked when the rewriter triggers a graceful bail-out. Default impl does nothing; + /// the production `HtmlRewriteController` overrides this to run the user-registered + /// bail-out handlers. + fn handle_bail_out(&mut self, _error: &RewritingError, _bail_out: &mut BailOut<'_>) {} } /// Defines an interface for the [`HtmlRewriter`]'s output. @@ -416,6 +421,19 @@ where self.delegate.remaining_content_start = 0; } + /// Invokes the transform controller's bail-out handlers (in registration order), + /// constructing a [`BailOut`] wrapper around the output sink and the current encoding. + /// Must be called *before* [`flush_for_bail_out()`] so that handler emissions land in + /// the sink ahead of the raw flush of remaining unparsed input. 
+ /// + /// [`flush_for_bail_out()`]: Self::flush_for_bail_out + pub fn run_bail_out_handlers(&mut self, error: &RewritingError) { + let mut bail_out = BailOut::new(&mut self.delegate.output_sink, self.encoding.get()); + self.delegate + .transform_controller + .handle_bail_out(error, &mut bail_out); + } + pub fn finish(&mut self, input: &[u8]) -> Result<(), RewritingError> { self.delegate.finish(self.encoding.get(), input) } diff --git a/src/transform_stream/mod.rs b/src/transform_stream/mod.rs index 9d9fe932..1d4fa08a 100644 --- a/src/transform_stream/mod.rs +++ b/src/transform_stream/mod.rs @@ -107,13 +107,16 @@ where // previous calls. Neither chunk has been emitted to the sink yet, so on a // graceful bail-out we flush both as-is and let the caller continue the // response from where they were. - if self.graceful_bail_out_on_memory_limit_exceeded { + let err = RewritingError::MemoryLimitExceeded(e); + + if self.should_bail_out_for(&err) { let dispatcher = self.parser.get_dispatcher(); + dispatcher.run_bail_out_handlers(&err); dispatcher.flush_for_bail_out(self.buffer.bytes()); dispatcher.flush_for_bail_out(data); } - return Err(RewritingError::MemoryLimitExceeded(e)); + return Err(err); } } } else { @@ -131,7 +134,9 @@ where // between `emit_chunk_before_lexeme()` and `consume_lexeme()`). Flushing from // there preserves all bytes the caller fed us. if self.should_bail_out_for(&e) { - self.parser.get_dispatcher().flush_for_bail_out(chunk); + let dispatcher = self.parser.get_dispatcher(); + dispatcher.run_bail_out_handlers(&e); + dispatcher.flush_for_bail_out(chunk); } return Err(e); @@ -150,11 +155,15 @@ where // Parsing succeeded but we can't buffer the leftover bytes for the next // call. On a graceful bail-out we flush the leftover raw so the response // stays whole. 
- if self.graceful_bail_out_on_memory_limit_exceeded { - self.parser.get_dispatcher().flush_for_bail_out(unconsumed); + let err = RewritingError::MemoryLimitExceeded(e); + + if self.should_bail_out_for(&err) { + let dispatcher = self.parser.get_dispatcher(); + dispatcher.run_bail_out_handlers(&err); + dispatcher.flush_for_bail_out(unconsumed); } - return Err(RewritingError::MemoryLimitExceeded(e)); + return Err(err); } self.has_buffered_data = true; @@ -183,7 +192,9 @@ where // Same reasoning as in `write()`: if we can bail out gracefully, make sure the sink // has all the input bytes before propagating the error. if self.should_bail_out_for(&e) { - self.parser.get_dispatcher().flush_for_bail_out(chunk); + let dispatcher = self.parser.get_dispatcher(); + dispatcher.run_bail_out_handlers(&e); + dispatcher.flush_for_bail_out(chunk); } return Err(e);