From 4a7aca63835717ef4d297ecb702dfa988f7b241d Mon Sep 17 00:00:00 2001 From: loovjo Date: Wed, 19 Apr 2023 13:25:36 +0200 Subject: [PATCH 01/10] Decode HTML escapes in the HTML parser --- Cargo.lock | 18 ++++++++++++++++++ Cargo.toml | 1 + src/parse.rs | 4 ++-- src/parse/attrs.rs | 2 +- src/parse/token.rs | 4 ++++ 5 files changed, 26 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e0faaf4..7146952 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,24 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "html-escape" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476" +dependencies = [ + "utf8-width", +] + [[package]] name = "html_editor" version = "0.5.2" +dependencies = [ + "html-escape", +] + +[[package]] +name = "utf8-width" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1" diff --git a/Cargo.toml b/Cargo.toml index e984b14..d8ff85d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,3 +10,4 @@ license = "MIT" keywords = ["html", "parser", "editor", "dom"] [dependencies] +html-escape = "0.2.13" diff --git a/src/parse.rs b/src/parse.rs index 35b20f1..446be4b 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -75,7 +75,7 @@ fn html_to_stack(html: &str) -> Result, String> { let txt_text = String::from_iter(chars_stack); chars_stack = Vec::new(); // Push the text we just got to the token stack. - token_stack.push(Token::Text(txt_text)); + token_stack.push(Token::from_text(txt_text)); } chars_stack.push(ch); } @@ -120,7 +120,7 @@ fn html_to_stack(html: &str) -> Result, String> { } if !chars_stack.is_empty() { let text = String::from_iter(chars_stack); - token_stack.push(Token::Text(text)); + token_stack.push(Token::from_text(text)); } Ok(token_stack) } diff --git a/src/parse/attrs.rs b/src/parse/attrs.rs index d0be766..c0f9d3c 100644 --- a/src/parse/attrs.rs +++ b/src/parse/attrs.rs @@ -70,7 +70,7 @@ pub fn parse(attr_str: String) -> Vec<(String, String)> { attr_pos = AttrPos::Space; let value = String::from_iter(chars_stack); chars_stack = Vec::new(); - value_stack.push(value) + value_stack.push(html_escape::decode_html_entities(&value).into_owned()) } } else { chars_stack.push(ch) diff --git a/src/parse/token.rs b/src/parse/token.rs index 4fe9753..b595548 100644 --- a/src/parse/token.rs +++ b/src/parse/token.rs @@ -83,6 +83,10 @@ impl Token { Self::Comment(comment[4..comment.len() - 3].to_string()) } + pub fn from_text(text: String) -> Self { + Self::Text(html_escape::decode_html_entities(&text).into_owned()) + } + pub fn node(&self) -> Node { self.clone().into_node() } From 1ccc2b50b374aac2ab86b8005b091e45ac7b9811 Mon Sep 17 00:00:00 2001 From: loovjo Date: Wed, 19 Apr 2023 13:26:27 +0200 Subject: [PATCH 02/10] Escape HTML escapes in HTML generation --- src/operation/html.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operation/html.rs b/src/operation/html.rs index de70bee..164dada 100644 --- a/src/operation/html.rs +++ b/src/operation/html.rs @@ -43,7 +43,7 @@ impl Htmlifiable for Element { if v.is_empty() { k.to_string() } else { - format!(r#"{}="{}""#, k, v) + format!(r#"{}="{}""#, k, html_escape::encode_double_quoted_attribute(&v).into_owned()) } }) .collect::>() @@ -67,7 +67,7 @@ impl Htmlifiable for Node { fn html(&self) -> String { match self { Node::Element(element) => element.html(), - Node::Text(text) => text.to_string(), + Node::Text(text) => html_escape::encode_text(text).into_owned(), Node::Comment(comment) => format!("", comment), Node::Doctype(doctype) => match &doctype { Doctype::Html => "".to_string(), From c3633ed9bda23f5910d5c0f884fb07a2d6fddf6c Mon Sep 17 00:00:00 2001 From: loovjo Date: Wed, 19 Apr 2023 13:26:38 +0200 Subject: [PATCH 03/10] Test cases for encoding and decoding HTML escapes --- tests/escapes.rs | 59 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/escapes.rs diff --git a/tests/escapes.rs b/tests/escapes.rs new file mode 100644 index 0000000..28cf568 --- /dev/null +++ b/tests/escapes.rs @@ -0,0 +1,59 @@ +use html_editor::operation::*; +use html_editor::{parse, Node, Element}; + +const HTML: &str = r#" + + + + + I <3 "escaping" + + +
+ + "#; + +#[test] +fn test_parse() { + let html = parse(HTML).unwrap(); + let title_selector = Selector::from("title"); + + let Some(title) = html.query(&title_selector) else { + assert!(false, "Invalid title"); + return; + }; + + assert_eq!(title.name, "title"); + let Some(Node::Text(title_content)) = title.children.get(0) else { + assert!(false, "Invalid title contents"); + return; + }; + assert_eq!(title_content, "I <3 \"escaping\""); + + let div_selector = Selector::from("#testee"); + + let Some(div) = html.query(&div_selector) else { + assert!(false, "Invalid div"); + return; + }; + + assert_eq!( + div.attrs, + vec![ + ("attr".into(), "id-with-\"quotes\"-inside".into()), + ("id".into(), "testee".into()), + ]); +} + +#[test] +fn test_generate() { + let element = Element::new( + "dummy-tag", + vec![("attr-1".into(), "attribute containing < and \" and &".into())], + vec![Node::Text("fake ".into())], + ); + + let generated = element.html(); + assert_eq!(generated, r#"fake <tag>"#); +} + From bed0cfa8343ae349e9ff9a9e7e686da07a34fb7c Mon Sep 17 00:00:00 2001 From: loovjo Date: Wed, 19 Apr 2023 13:58:49 +0200 Subject: [PATCH 04/10] Ensure text inside script and style tags isn't escaped --- src/operation/html.rs | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/operation/html.rs b/src/operation/html.rs index 164dada..30c0d6e 100644 --- a/src/operation/html.rs +++ b/src/operation/html.rs @@ -29,11 +29,27 @@ pub trait Htmlifiable { impl Htmlifiable for Element { fn html(&self) -> String { + let children_html = match self.name.as_str() { + "style" | "script" => { + // + + "#; + + let html = parse(HTML).unwrap(); + + let script_selector = Selector::from("script"); + + let Some(script) = html.query(&script_selector) else { + assert!(false, "Couldn't find script"); + return; + }; + + assert_eq!(script.name, "script"); + let Some(Node::Text(script_content)) = script.children.get(0) else { + assert!(false, "Invalid script contents"); + return; + }; + + assert_eq!(script_content, r#"let text = "this tag shouldn't be escaped ->

hi

""#); + + let style_selector = Selector::from("style"); + + let Some(style) = html.query(&style_selector) else { + assert!(false, "Couldn't find style"); + return; + }; + + assert_eq!(style.name, "style"); + let Some(Node::Text(style_content)) = style.children.get(0) else { + assert!(false, "Invalid script contents"); + return; + }; + + assert_eq!(style_content, r#"main:before { content: "fake tag"; }"#); +} + +#[test] +fn no_escapes_in_script_and_style() { + let element = Element::new( + "head", + vec![], + vec![ + Node::Element(Element::new( + "script", + vec![], + vec![Node::Text(r#"let text = "this tag shouldn't be escaped ->

hi

""#.into())], + )), + Node::Element(Element::new( + "style", + vec![], + vec![Node::Text(r#"main:before { content: "fake tag"; }"#.into())], + )), + ], + ); + + let generated = element.html(); + assert_eq!(generated, r#""#); +} From 88dfaeff5dfa0ecc5ef67795cdbda4d1b1ee954a Mon Sep 17 00:00:00 2001 From: loovjo Date: Wed, 19 Apr 2023 20:32:05 +0200 Subject: [PATCH 06/10] Add documentation to Element::Text --- src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index e8f2628..861f324 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -52,6 +52,12 @@ pub enum Doctype { #[derive(Debug, Clone)] pub enum Node { Element(Element), + /// A text node in the DOM. The contents of the Text has all entities expanded, + /// for example parsing `I <3 HTML` would result in a `Text("I <3 HTML")` + /// + /// Note that `