
Commit 5614806

Tidy lexer and fix error tokens

1 parent d5bf093

File tree

4 files changed (+66, -93)

jsonpath_rfc9535/lex.py

Lines changed: 46 additions & 85 deletions
@@ -17,16 +17,11 @@
 RE_WHITESPACE = re.compile(r"[ \n\r\t]+")
 RE_PROPERTY = re.compile(r"[\u0080-\uFFFFa-zA-Z_][\u0080-\uFFFFa-zA-Z0-9_-]*")
 RE_INDEX = re.compile(r"-?[0-9]+")
-RE_INT = re.compile(r"-?[0-9]+")
-RE_EXPONENT = re.compile(r"[eE][+-]?[0-9]+")
-RE_NEGATIVE_EXPONENT = re.compile(r"[eE]-[0-9]+")
+RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
+# RE_FLOAT includes numbers with a negative exponent and no decimal point.
+RE_FLOAT = re.compile(r"(:?-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")
 RE_FUNCTION_NAME = re.compile(r"[a-z][a-z_0-9]*")
-RE_AND = re.compile(r"&&")
-RE_OR = re.compile(r"\|\|")
-RE_TRUE = re.compile(r"true")
-RE_FALSE = re.compile(r"false")
-RE_NULL = re.compile(r"null")
-RE_ESCAPE = re.compile(r"\\[bfnrtu/]")
+ESCAPES = frozenset(["b", "f", "n", "r", "t", "u", "/", "\\"])


 class Lexer:
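
RE_INT now absorbs non-negative exponents and RE_FLOAT takes everything with a decimal point, plus the negative-exponent case called out in the comment. A minimal standalone check of how the two patterns classify number literals, trying RE_FLOAT first as the elif chain in lex_inside_filter below does:

    import re

    RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
    RE_FLOAT = re.compile(r"(:?-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")

    for literal in ["42", "1e2", "1.5", "-0.5e3", "2e-2"]:
        if RE_FLOAT.fullmatch(literal):
            print(literal, "-> FLOAT")  # 1.5, -0.5e3 and 2e-2 land here
        elif RE_INT.fullmatch(literal):
            print(literal, "-> INT")    # 42 and 1e2 land here

Note that 2e-2 lexes as a float (it evaluates to 0.01) while 1e2 stays an int, which is exactly the asymmetry the two patterns encode.
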
@@ -77,14 +72,13 @@ def emit(self, t: TokenType) -> None:

     def next(self) -> str:
         """Return the next character, or the empty string if no more characters."""
-        # TODO: benchmark ty/except approach
-        if self.pos >= len(self.query):
+        try:
+            c = self.query[self.pos]
+            self.pos += 1
+            return c
+        except IndexError:
             return ""

-        c = self.query[self.pos]
-        self.pos += 1
-        return c
-
     def ignore(self) -> None:
         """Ignore characters up to the pointer."""
         self.start = self.pos
@@ -101,15 +95,13 @@ def backup(self) -> None:

     def peek(self) -> str:
         """Return the next character without advancing the pointer."""
-        # TODO: benchmark try/except without self.next()
-        c = self.next()
-        if c:
-            self.backup()
-        return c
+        try:
+            return self.query[self.pos]
+        except IndexError:
+            return ""

     def accept(self, s: str) -> bool:
         """Increment the pointer if the current position starts with _s_."""
-        # TODO: benchmark using accept instead of accept_match for known words
         if self.query.startswith(s, self.pos):
             self.pos += len(s)
             return True
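
The deleted TODOs asked for benchmarks of exactly this change, and the pyproject.toml hunk below adds a benchmark script. That script isn't part of this diff, so here is a hypothetical stand-in sketching the trade-off being made: EAFP reads pay for the IndexError only once per scan instead of bounds-checking every character.

    import timeit

    QUERY = "$.some[?@.thing > 1.5]" * 10

    def scan_with_bounds_check(query: str) -> int:
        # Old approach: guard every read with an explicit length check.
        pos = 0
        while pos < len(query):
            _ = query[pos]
            pos += 1
        return pos

    def scan_with_try_except(query: str) -> int:
        # New approach: read unconditionally, catch IndexError at the end.
        pos = 0
        try:
            while True:
                _ = query[pos]
                pos += 1
        except IndexError:
            return pos

    print(timeit.timeit(lambda: scan_with_bounds_check(QUERY), number=10_000))
    print(timeit.timeit(lambda: scan_with_try_except(QUERY), number=10_000))
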
@@ -141,20 +133,25 @@ def ignore_whitespace(self) -> bool:

     def error(self, msg: str) -> None:
         """Emit an error token."""
-        # TODO: move msg out of Token.value. We'll need the value too when implementing
         # better error messages.
-        self.tokens.append(Token(TokenType.ERROR, msg, self.pos, self.query))
+        self.tokens.append(
+            Token(
+                TokenType.ERROR,
+                self.query[self.start : self.pos],
+                self.start,
+                self.query,
+                msg,
+            )
+        )


 StateFn = Callable[[Lexer], Optional["StateFn"]]


 def lex_root(l: Lexer) -> Optional[StateFn]:  # noqa: D103
-    # TODO: benchmark peek/next instead of next/backup
     c = l.next()

     if c != "$":
-        l.backup()
         l.error(f"expected '$', found {c!r}")
         return None

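error() now stores the offending lexeme (self.query[self.start : self.pos]) in Token.value and the lexeme's start position in Token.index, moving the human-readable message to a new fifth argument. For the lex_root failure above, the updated "missing root selector" test below expects exactly this token (import path assumed from the test module):

    from jsonpath_rfc9535.tokens import Token, TokenType

    Token(
        TokenType.ERROR,
        "f",                        # the lexeme, query[start:pos]
        0,                          # self.start, where the lexeme begins
        "foo.bar",
        "expected '$', found 'f'",  # the message, previously kept in value
    )
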
@@ -184,9 +181,8 @@ def lex_segment(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0911
         l.emit(TokenType.LBRACKET)
         return lex_inside_bracketed_segment

-    # default
-    l.backup()
     if l.filter_depth:
+        l.backup()
         return lex_inside_filter

     l.error(f"expected '.', '..' or a bracketed selection, found {c!r}")
@@ -208,21 +204,21 @@ def lex_descendant_segment(l: Lexer) -> Optional[StateFn]:  # noqa: D103
         l.emit(TokenType.LBRACKET)
         return lex_inside_bracketed_segment

-    # default
     l.backup()

     if l.accept_match(RE_PROPERTY):
         l.emit(TokenType.PROPERTY)
         return lex_segment

+    l.next()
     l.error(f"unexpected descendant selection token {c!r}")
     return None


 def lex_shorthand_selector(l: Lexer) -> Optional[StateFn]:  # noqa: D103
     l.ignore()  # ignore dot

-    if l.ignore_whitespace():
+    if l.accept_match(RE_WHITESPACE):
         l.error("unexpected whitespace after dot")
         return None

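Swapping ignore_whitespace() for accept_match(RE_WHITESPACE) is what moves the whitespace error in the tests below from index 3 to index 2: the whitespace run now sits between l.start and l.pos when error() fires, so the token reports where the run begins and carries it as the value. Again with the import path assumed from the test module:

    from jsonpath_rfc9535.tokens import Token, TokenType

    # For "$. foo.bar": '$' is at index 0, the ignored dot at index 1, so
    # the offending whitespace starts at index 2 and becomes the value.
    Token(TokenType.ERROR, " ", 2, "$. foo.bar", "unexpected whitespace after dot")
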
@@ -322,11 +318,9 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0915, PL
             return lex_inside_bracketed_segment

         if c == "'":
-            # String literal
             return lex_single_quoted_string_inside_filter_expression

         if c == '"':
-            # String literal
             return lex_double_quoted_string_inside_filter_expression

         if c == "(":
@@ -392,62 +386,31 @@ def lex_inside_filter(l: Lexer) -> Optional[StateFn]:  # noqa: D103, PLR0915, PL
             l.emit(TokenType.GT)
             continue

-        # default
         l.backup()

-        # numbers
-        # TODO: try accept_match(RE_FLOAT), including negative exponent
-        if l.accept_match(RE_INT):
-            if l.peek() == ".":
-                # A float
-                l.next()
-                if not l.accept_match(RE_INT):
-                    l.error("a fractional digit is required after a decimal point")
-                    return None
-
-                l.accept_match(RE_EXPONENT)
-                l.emit(TokenType.FLOAT)
-                continue
-
-            # An int, or float if exponent is negative
-            if l.accept_match(RE_NEGATIVE_EXPONENT):
-                l.emit(TokenType.FLOAT)
-            else:
-                l.accept_match(RE_EXPONENT)
-                l.emit(TokenType.INT)
-            continue
-
-        if l.accept_match(RE_AND):
+        if l.accept("&&"):
             l.emit(TokenType.AND)
-            continue
-
-        if l.accept_match(RE_OR):
+        elif l.accept("||"):
             l.emit(TokenType.OR)
-            continue
-
-        if l.accept_match(RE_TRUE):
+        elif l.accept("true"):
             l.emit(TokenType.TRUE)
-            continue
-
-        if l.accept_match(RE_FALSE):
+        elif l.accept("false"):
             l.emit(TokenType.FALSE)
-            continue
-
-        if l.accept_match(RE_NULL):
+        elif l.accept("null"):
             l.emit(TokenType.NULL)
-            continue
-
-        # functions
-        if l.accept_match(RE_FUNCTION_NAME) and l.peek() == "(":
+        elif l.accept_match(RE_FLOAT):
+            l.emit(TokenType.FLOAT)
+        elif l.accept_match(RE_INT):
+            l.emit(TokenType.INT)
+        elif l.accept_match(RE_FUNCTION_NAME) and l.peek() == "(":
             # Keep track of parentheses for this function call.
             l.paren_stack.append(1)
             l.emit(TokenType.FUNCTION)
             l.next()
             l.ignore()  # ignore LPAREN
-            continue
-
-        l.error(f"unexpected filter selector token {c!r}")
-        return None
+        else:
+            l.error(f"unexpected filter selector token {c!r}")
+            return None


 def lex_string_factory(quote: str, state: StateFn) -> StateFn:
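
Two ordering details in the new elif chain are easy to miss. RE_FLOAT must be tried before RE_INT because matching at a position, rather than against a whole literal, would let RE_INT consume just the integer prefix of a float; and the keyword branches sit before RE_FUNCTION_NAME, which would otherwise also match "true", "false" and "null". A sketch of the first point:

    import re

    RE_INT = re.compile(r"-?[0-9]+(?:[eE]\+?[0-9]+)?")
    RE_FLOAT = re.compile(r"(:?-?[0-9]+\.[0-9]+(?:[eE][+-]?[0-9]+)?)|(-?[0-9]+[eE]-[0-9]+)")

    # Matching at the current position, as accept_match presumably does:
    print(RE_INT.match("1.5 < @.n", 0).group())    # "1", stops at the dot
    print(RE_FLOAT.match("1.5 < @.n", 0).group())  # "1.5"
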
@@ -472,17 +435,15 @@ def _lex_string(l: Lexer) -> Optional[StateFn]:
             return state

         while True:
-            head = l.query[l.pos : l.pos + 2]
             c = l.next()

-            if head in ("\\\\", f"\\{quote}"):
-                l.next()
-                continue
-
-            # TODO: replace use of `head` with peek
-            if c == "\\" and not RE_ESCAPE.match(head):
-                l.error("invalid escape")
-                return None
+            if c == "\\":
+                peeked = l.peek()
+                if peeked in ESCAPES or peeked == quote:
+                    l.next()
+                else:
+                    l.error("invalid escape")
+                    return None

             if not c:
                 l.error(f"unclosed string starting at index {l.start}")
@@ -528,6 +489,6 @@ def tokenize(query: str) -> List[Token]:
     lexer.run()

     if tokens and tokens[-1].type_ == TokenType.ERROR:
-        raise JSONPathSyntaxError(tokens[-1].value, token=tokens[-1])
+        raise JSONPathSyntaxError(tokens[-1].message, token=tokens[-1])

     return tokens
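
With messages moved off Token.value, tokenize() now builds the JSONPathSyntaxError from the new message attribute. A usage sketch, assuming only the module paths visible in this diff (JSONPathSyntaxError's home module isn't shown, so the except clause stays generic):

    from jsonpath_rfc9535.lex import tokenize

    try:
        tokenize("foo.bar")  # missing root selector
    except Exception as err:  # JSONPathSyntaxError in practice
        print(err)  # built from the ERROR token's message attribute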

jsonpath_rfc9535/tokens.py

Lines changed: 4 additions & 2 deletions
@@ -67,24 +67,26 @@ class Token:
     token derives.
     """

-    __slots__ = ("type_", "value", "index", "query")
+    __slots__ = ("type_", "value", "index", "query", "message")

     def __init__(
         self,
         type_: TokenType,
         value: str,
         index: int,
         query: str,
+        message: str | None = None,
     ) -> None:
         self.type_ = type_
         self.value = value
         self.index = index
         self.query = query
+        self.message = message

     def __repr__(self) -> str:  # pragma: no cover
         return (
             f"Token(type={self.type_.name!r}, value={self.value!r}, "
-            f"index={self.index}, query={self.query!r})"
+            f"index={self.index}, query={self.query!r}, message={self.message!r})"
         )

     def __eq__(self, other: object) -> bool:

pyproject.toml

Lines changed: 2 additions & 1 deletion
@@ -64,9 +64,10 @@ no-cov = "cov --no-cov {args}"
 test = "pytest {args}"
 lint = "ruff check ."
 typing = "mypy"
+benchmark = "python scripts/benchmark.py"

 [[tool.hatch.envs.all.matrix]]
-python = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+python = ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13", "pypy3.10"]

 [tool.coverage.run]
 branch = true

tests/test_lex.py

Lines changed: 14 additions & 5 deletions
@@ -57,7 +57,7 @@ class Case:
         description="missing root selector",
         query="foo.bar",
         want=[
-            Token(TokenType.ERROR, "expected '$', found 'f'", 0, "foo.bar"),
+            Token(TokenType.ERROR, "f", 0, "foo.bar", "expected '$', found 'f'"),
         ],
     ),
     Case(
@@ -67,9 +67,10 @@ class Case:
             Token(TokenType.ROOT, "$", 0, "$foo"),
             Token(
                 TokenType.ERROR,
-                "expected '.', '..' or a bracketed selection, found 'f'",
+                "f",
                 1,
                 "$foo",
+                "expected '.', '..' or a bracketed selection, found 'f'",
             ),
         ],
     ),
@@ -88,7 +89,13 @@ class Case:
         query="$. foo.bar",
         want=[
             Token(TokenType.ROOT, "$", 0, "$. foo.bar"),
-            Token(TokenType.ERROR, "unexpected whitespace after dot", 3, "$. foo.bar"),
+            Token(
+                TokenType.ERROR,
+                " ",
+                2,
+                "$. foo.bar",
+                "unexpected whitespace after dot",
+            ),
         ],
     ),
     Case(
@@ -129,9 +136,10 @@ class Case:
             Token(TokenType.DOUBLE_DOT, "..", 1, "$...foo"),
             Token(
                 TokenType.ERROR,
-                "unexpected descendant selection token '.'",
+                ".",
                 3,
                 "$...foo",
+                "unexpected descendant selection token '.'",
             ),
         ],
     ),
@@ -143,9 +151,10 @@ class Case:
             Token(TokenType.DOUBLE_DOT, "..", 1, "$....foo"),
             Token(
                 TokenType.ERROR,
-                "unexpected descendant selection token '.'",
+                ".",
                 3,
                 "$....foo",
+                "unexpected descendant selection token '.'",
             ),
         ],
     ),
