1- From faee2893e499bdcaa3a511bcff197366b8a87968 Mon Sep 17 00:00:00 2001
1+ From 9d60c0fda0b51e9374a234c48df36130d2c988ee Mon Sep 17 00:00:00 2001
22From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
33Date: Sat, 26 Aug 2023 15:08:59 +0200
44Subject: [PATCH] Expose line and column information for use in PHP
55
66---
77 source/lexbor/dom/interfaces/node.h | 2 ++
88 source/lexbor/html/token.h | 2 ++
9- source/lexbor/html/tokenizer.c | 22 +++++++++++++++++++++-
9+ source/lexbor/html/tokenizer.c | 24 +++++++++++++++++++++++-
1010 source/lexbor/html/tokenizer.h | 2 ++
1111 source/lexbor/html/tokenizer/state.h | 2 ++
1212 source/lexbor/html/tree.c | 11 +++++++++++
1313 source/lexbor/html/tree/error.c | 5 +++--
1414 source/lexbor/html/tree/error.h | 5 +++--
15- 8 files changed, 46 insertions(+), 5 deletions(-)
15+ 8 files changed, 48 insertions(+), 5 deletions(-)
1616
1717diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h
1818index 4a10197..ff9c924 100755
@@ -41,7 +41,7 @@ index 79accd0..0b7f4fd 100755
4141 const lxb_char_t *text_start;
4242 const lxb_char_t *text_end;
4343diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c
44- index 741bced..a399758 100755
44+ index 741bced..0bd9aec 100755
4545--- a/source/lexbor/html/tokenizer.c
4646+++ b/source/lexbor/html/tokenizer.c
4747@@ -91,6 +91,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
@@ -61,29 +61,31 @@ index 741bced..a399758 100755
6161
6262 return LXB_STATUS_OK;
6363 }
64- @@ -312,7 +315,24 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
64+ @@ -312,7 +315,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
6565 tkz->last = end;
6666
6767 while (data < end) {
6868- data = tkz->state(tkz, data, end);
69+ + size_t current_column = tkz->current_column;
6970+ const lxb_char_t *new_data = tkz->state(tkz, data, end);
7071+ while (data < new_data) {
7172+ /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
7273+ if (*data == '\n') {
7374+ tkz->current_line++;
74- + tkz->current_column = 0;
75+ + current_column = 0;
7576+ } else {
7677+ /* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code.
7778+ * Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. */
7879+ if ((*data & 0b11000000) == 0b10000000) {
7980+ /* Continuation byte, do nothing */
8081+ } else {
8182+ /* First byte for a codepoint */
82- + tkz->current_column++;
83+ + current_column++;
8384+ }
8485+ }
8586+ data++;
8687+ }
88+ + tkz->current_column = current_column;
8789 }
8890
8991 return tkz->status;
@@ -182,5 +184,5 @@ index 2fd06cb..ed1859f 100755
182184 lxb_html_tree_error_t;
183185
184186- -
185- 2.41.0
187+ 2.43.0
186188
0 commit comments